PRODUCT LIST:

ORAL B ŞARJLI DİŞ FIRÇASI

Sleepy Bebek Islak Mendil

La Roche Posay Yüz Temizleyici

Fakir Dik Süpürge

Xiaomi Bluetooth Kulaklık

Trendyolmilla Tayt

Trendyolmilla Bikini 2

Trendyolmilla Bikini1

Altinyildiz Classics Mont

library(jsonlite)
library(httr)
library(data.table)
library(ggplot2)
library(ggcorrplot)
library(readr)
library(tsibble)
library(zoo)
library(forecast)
library(mgcv)
library(lubridate)
library(urca)
library(Metrics)
library(mgcv)
library(gratia)
options(repr.plot.width = 10, repr.plot.height = 4)
accu=function(actual,forecast){
  n=length(actual)
  error=actual-forecast
  mean=mean(actual)
  sd=sd(actual)
  CV=sd/mean
  FBias=sum(error)/sum(actual)
  MAPE=sum(abs(error/actual))/n
  RMSE=sqrt(sum(error^2)/n)
  MAD=sum(abs(error))/n
  MADP=sum(abs(error))/sum(abs(actual))
  WMAPE=MAD/mean
  l=data.frame(n,mean,sd,CV,FBias,MAPE,RMSE,MAD,MADP,WMAPE)
  return(l)
}

get_token <- function(username, password, url_site){
  
  post_body = list(username=username,password=password)
  post_url_string = paste0(url_site,'/token/')
  result = POST(post_url_string, body = post_body)
  
  # error handling (wrong credentials)
  if(result$status_code==400){
    print('Check your credentials')
    return(0)
  }
  else if (result$status_code==201){
    output = content(result)
    token = output$key
  }
  
  return(token)
}

get_data <- function(start_date="1", token, url_site){
  
  post_body = list(start_date=start_date,username=username,password=password)
  post_url_string = paste0(url_site,'/dataset/')
  
  header = add_headers(c(Authorization=paste('Token',token,sep=' ')))
  result = GET(post_url_string, header, body = post_body)
  output = content(result)
  data = data.table::rbindlist(output)
  data[,event_date:=as.Date(event_date)]
  data = data[order(product_content_id,event_date)]
  return(data)
}
subm_url = 'http://46.101.163.177'

u_name = "Group4"
p_word = "a4TStQDQYjpverak"
submit_now = FALSE

username = u_name
password = p_word

token = get_token(username=u_name, password=p_word, url=subm_url)
data_son = get_data(token=token,url=subm_url)

ProjectRawData <- read_csv("ProjectRawData.csv", 
                           col_types = cols(event_date = col_date(format = "%Y-%m-%d")))
data <- data.table(ProjectRawData)
data <- data.table(rbind(data,data_son[event_date > max(data$event_date)]))[order(event_date)]


discount_dates <- as.Date(c("2021-03-10", "2021-03-11", "2021-03-12",
                            "2021-02-13", "2021-02-14", "2021-02-12",
                            "2020-12-31", "2020-12-30","2020-12-29",
                            "2020-11-25", "2020-11-26", "2020-11-27", "2020-11-28", "2020-11-29",
                            "2020-11-09", "2020-11-10","2020-11-11",
                            "2020-09-10", "2020-09-11", "2020-09-12",
                            "2020-06-18", "2020-06-19", "2020-06-20",
                            "2021-05-07","2021-05-08", "2021-05-09"))

data <- data[,special_days:=0]
data[data$event_date %in% discount_dates]$special_days <- 1
data$ratio <- data$category_sold / data$category_favored
dt <- data[event_date < "2021-05-28"]
test_set <- data[event_date >= "2021-05-28" & event_date <= "2021-06-11"]
train_start=as.Date('2020-05-25')
test_start=as.Date('2021-05-28')
test_end=as.Date('2021-06-11')
test_dates=seq(test_start,test_end,by='day')
result_start=as.Date('2021-06-12')
result_end=as.Date('2021-06-26')
result_dates=seq(result_start,result_end,by='day')
#data seperation part
#Mont, ALTINYILDIZ CLASSICS
prod1 <- dt[product_content_id==48740784,]
prod1 <- prod1[order(event_date)]
test_set1 <- test_set[product_content_id==48740784,]
test_set1 <- test_set1[order(event_date)]
#Bikini Ustu 1, TRENDYOLMILLA
prod2 <- dt[product_content_id==73318567,]
prod2 <- prod2[order(event_date)]
test_set2 <- test_set[product_content_id==73318567,]
test_set2 <- test_set2[order(event_date)]
#Bikini Ustu 2, TRENDYOLMILLA
prod3 <- dt[product_content_id==32737302,]
prod3 <- prod3[order(event_date)]
test_set3 <- test_set[product_content_id==32737302,]
test_set3 <- test_set3[order(event_date)]
#Tayt, TRENDYOLMILLA
prod4 <- dt[product_content_id==31515569,]
prod4 <- prod4[order(event_date)]
test_set4 <- test_set[product_content_id==31515569,]
test_set4 <- test_set4[order(event_date)]
#Bluetooth Kulaklik, Xiaomi
prod5 <- dt[product_content_id==6676673,]
prod5 <- prod5[order(event_date)]
test_set5 <- test_set[product_content_id==6676673,]
test_set5 <- test_set5[order(event_date)]
#Dik Supurge, Fakir
prod6 <- dt[product_content_id==7061886,]
prod6 <- prod6[order(event_date)]
test_set6 <- test_set[product_content_id==7061886,]
test_set6 <- test_set6[order(event_date)]
#Yuz Temizleyici, La Roche Posay
prod7 <- dt[product_content_id==85004,]
prod7 <- prod7[order(event_date)]
test_set7 <- test_set[product_content_id==85004,]
test_set7 <- test_set7[order(event_date)]
#Bebek Islak Mendil, Sleepy
prod8 <- dt[product_content_id==4066298,]
prod8 <- prod8[order(event_date)]
test_set8 <- test_set[product_content_id==4066298,]
test_set8 <- test_set8[order(event_date)]
#Sarj Edebilir Dis Fircasi, Oral-B
prod9 <- dt[product_content_id==32939029,]
prod9 <- prod9[order(event_date)]
test_set9 <- test_set[product_content_id==32939029,]
test_set9 <- test_set9[order(event_date)]

ORAL B ŞARJLI DİŞ FIRÇASI

ggplot(prod9, aes(x=event_date, y=sold_count)) + geom_line() + geom_smooth(se=F)

acf(prod9$sold_count) 

From the ACF tableau above, it seems that there is a trend. Also, there are peak points at lag 7 which indicates that there might be weekly seasonality.

Weekly Decomposition

Because the variance increases over time, the multiplicative decomposition method will be used.

prod9_weekly_dec <- decompose(ts(prod9$sold_count, freq=7), type="multiplicative")
plot(prod9_weekly_dec)

plot(prod9_weekly_dec$random)

Now, let’s check whether the detrended and deseasonalized data is stationary.

u_test <- ur.kpss(prod9_weekly_dec$random)
summary(u_test)
####################### 
# KPSS Unit Root Test # 
####################### 

Test is of type: mu with 5 lags. 

Value of test-statistic is: 0.1869 

Critical value for a significance level of: 
                10pct  5pct 2.5pct  1pct
critical values 0.347 0.463  0.574 0.739

From the KPSS test above, it can be said that the data is stationary.

acf(prod9_weekly_dec$random, na.action=na.pass)
pacf(prod9_weekly_dec$random, na.action=na.pass)

There are slightly high autocorrelation at lag 1 and lag 3. From PACF above, it seems that there are low correlations with previous 3 observations in the weekly decomposed data, but the data is still stationary.

Monthly Decomposition

prod9_monthly <- prod9[, month:=yearmonth(event_date)]
prod9_monthly[, .(sold_count=sum(sold_count)), by=month]
prod9_monthly[, trend:=1:.N]

prod9_reg <- lm(sold_count~trend+as.factor(month), data=prod9_monthly)
prod9_monthly[, regOutput:=predict(prod9_reg, prod9_monthly)]

prod9_monthly[, random:=sold_count-regOutput]

ggplot(prod9_monthly, aes(x=event_date, y=random)) + geom_line()
month sold_count
2020 May 476
2020 Jun 824
2020 Jul 426
2020 Aug 1020
2020 Sep 1883
2020 Oct 1102
2020 Nov 1502
2020 Dec 2128
2021 Jan 2460
2021 Feb 2963
2021 Mar 4266
2021 Apr 6575
2021 May 8098

Now, let’s check whether the monthly detrended and deseasonalized data is stationary.

u_test <- ur.kpss(prod9_monthly$random)
summary(u_test)
####################### 
# KPSS Unit Root Test # 
####################### 

Test is of type: mu with 5 lags. 

Value of test-statistic is: 0.015 

Critical value for a significance level of: 
                10pct  5pct 2.5pct  1pct
critical values 0.347 0.463  0.574 0.739

From the KPSS unit test, we cannot reject the null hyphotesis. The data seems to be stationary.

acf(prod9_monthly$random, na.action=na.pass)
pacf(prod9_monthly$random, na.action=na.pass)

From the PACF graph, there is a correlation with lag 1 in the data.

Building an ARIMA Model

disf <- data[product_content_id=="32939029",][order(event_date)]
disf_train <- disf[event_date<test_start,]
disf_ts <- ts(disf_train$sold_count,freq=7)

After the decomposition, it could be said that the chargable toothbrush has a weekly seasonality. Differencing could make the series stationary.

ggplot(disf_train[2:.N,],aes(x=event_date)) + geom_line(aes(y=diff(disf_train$sold_count,1))) + 
labs(title= "Graph of Differenced Series", x= "Date", y="Quantity")

After the differencing, the data seems stationary but it has some outliers and non-constant variance.

acf(diff(disf_train$sold_count,1))
pacf(diff(disf_train$sold_count,1))

At the ACF graph, there is a spike at lag 2 which indicates MA(2) model and also there is slighlty significant spike at lag 7 which is the seasonal MA(1). At the PACF graph, there is spikes at lag 2 and 4. AR(2) and AR(4) models could be applied.

arima(disf_ts,order=c(2,1,2),seasonal=c(0,0,1))
Call:
arima(x = disf_ts, order = c(2, 1, 2), seasonal = c(0, 0, 1))

Coefficients:
         ar1      ar2      ma1     ma2    sma1
      1.7679  -0.8004  -1.9727  0.9804  0.1349
s.e.  0.0345   0.0340   0.0171  0.0168  0.0555

sigma^2 estimated as 1777:  log likelihood = -1895.31,  aic = 3802.63
arima(disf_ts,order=c(4,1,2),seasonal=c(0,0,1))
Call:
arima(x = disf_ts, order = c(4, 1, 2), seasonal = c(0, 0, 1))

Coefficients:
         ar1      ar2     ar3      ar4      ma1     ma2    sma1
      0.8586  -0.1346  0.0465  -0.0526  -1.0146  0.0604  0.1120
s.e.  0.8228   0.6586  0.0934   0.0545   0.8243  0.7871  0.0571

sigma^2 estimated as 1834:  log likelihood = -1900.03,  aic = 3816.06
auto.arima(disf_ts)
Series: disf_ts 
ARIMA(2,1,2)(0,0,1)[7] with drift 

Coefficients:
         ar1      ar2      ma1     ma2    sma1   drift
      1.7675  -0.8063  -1.9672  0.9751  0.1516  0.7005
s.e.  0.0351   0.0345   0.0177  0.0176  0.0550  0.5119

sigma^2 estimated as 1803:  log likelihood=-1894.91
AIC=3803.82   AICc=3804.13   BIC=3831.16

After trying some SARIMA models, auto.arima function gives the nearly same result with the SARIMA(2,1,2)(0,0,1). We could use our model to forecasting.

model_disf_sarima <- arima(disf_ts,order=c(2,1,2),seasonal=c(0,0,1))
checkresiduals(model_disf_sarima)
    Ljung-Box test

data:  Residuals from ARIMA(2,1,2)(0,0,1)[7]
Q* = 8.7094, df = 9, p-value = 0.4645

Model df: 5.   Total lags used: 14

Residuals look stationary with constant mean at 0. Variance isn’t constant but there is not any significant correlations at ACF graph. The distribution seems normal.

disf_train$res1 <- model_disf_sarima$residuals
corr <- cor(disf_train[!is.na(disf_train$price),c(-1,-2)])
ggcorrplot(corr,hc.order = TRUE, type = "lower")

Residuals have highest correlation with basket_count which 0.4. Also it still has correlation with output variable which means there is still unexplained data in sold_count. We could add basket_count as a regressor.

model_disf_sarimax <- arima(disf_ts,order=c(2,1,2),seasonal=c(0,0,1),xreg=disf_train$basket_count)
disf_train$res2 <- model_disf_sarimax$residuals
model_disf_sarimax
checkresiduals(model_disf_sarimax)
Call:
arima(x = disf_ts, order = c(2, 1, 2), seasonal = c(0, 0, 1), xreg = disf_train$basket_count)

Coefficients:
         ar1     ar2      ma1      ma2    sma1  disf_train$basket_count
      0.4948  0.1012  -0.9428  -0.0173  0.0144                   0.2016
s.e.  0.3293  0.1832   0.3275   0.3085  0.0591                   0.0069

sigma^2 estimated as 584:  log likelihood = -1690.23,  aic = 3394.47



    Ljung-Box test

data:  Residuals from ARIMA(2,1,2)(0,0,1)[7]
Q* = 8.6399, df = 8, p-value = 0.3736

Model df: 6.   Total lags used: 14

After the addition of the baskes_count as a regressor, AIC value is reduced to 3394 from 3804. Residuals didn’t change much after the addition of the regressor.

corr <- cor(disf_train[!is.na(disf_train$price),c(-1,-2)])
ggcorrplot(corr,hc.order = TRUE, type = "lower")

Residuals still have correlation with category_visits but not significantly, it could be added as a regressor.

model_disf_sarimax2 <- arima(disf_ts,order=c(2,1,2),seasonal=c(0,0,1),xreg=as.matrix(disf_train$basket_count,disf_train$category_visits))
disf_train$res3 <- model_disf_sarimax2$residuals
model_disf_sarimax2
Call:
arima(x = disf_ts, order = c(2, 1, 2), seasonal = c(0, 0, 1), xreg = as.matrix(disf_train$basket_count, 
    disf_train$category_visits))

Coefficients:
         ar1     ar2      ma1      ma2    sma1
      0.4948  0.1012  -0.9428  -0.0173  0.0144
s.e.  0.3293  0.1832   0.3275   0.3085  0.0591
      as.matrix(disf_train$basket_count, disf_train$category_visits)
                                                              0.2016
s.e.                                                          0.0069

sigma^2 estimated as 584:  log likelihood = -1690.23,  aic = 3394.47

The addition of the new regressor didn’t change the AIC value. So, the previous SARIMAX model could be use in forecasting.

Linear Models

Firstly, the days of week and trend columns will be added to the data. Next, the first model will be built with trend and weekly seasonality. Then, we’ll look for possible regressors and add them to the model. Finally, we will find the best model among them.

prod9[, `:=`(trend=1:.N, 
            days=wday(event_date, label=T))]

model9_1 <- lm(sold_count~trend+days, data=prod9)
summary(model9_1)
Call:
lm(formula = sold_count ~ trend + days, data = prod9)

Residuals:
     Min       1Q   Median       3Q      Max 
-132.639  -52.786   -4.228   25.393  312.511 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -24.5927     7.5574  -3.254  0.00125 ** 
trend         0.6295     0.0355  17.733  < 2e-16 ***
days.L      -11.1555    10.0111  -1.114  0.26589    
days.Q      -24.6844     9.9930  -2.470  0.01397 *  
days.C        6.6013     9.9841   0.661  0.50892    
days^4        4.9652     9.9780   0.498  0.61906    
days^5       -0.5982     9.9568  -0.060  0.95213    
days^6       -1.4376     9.9403  -0.145  0.88509    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 72.34 on 360 degrees of freedom
Multiple R-squared:  0.4726,    Adjusted R-squared:  0.4623 
F-statistic: 46.08 on 7 and 360 DF,  p-value: < 2.2e-16
checkresiduals(model9_1)
    Breusch-Godfrey test for serial correlation of order up to 11

data:  Residuals
LM test = 241.05, df = 11, p-value < 2.2e-16

prod9[, residuals1:=model9_1$residuals]
corr <- round(cor(Filter(is.numeric, prod9[!is.na(prod9$price)])), 1)
ggcorrplot(corr, hc.order = TRUE,type = "lower")

The maximum correlation with residuals1 is obtained by basket_count. Therefore, its lagged version will be added to the model. To find the lag value, we’ll check the CCF function.

ccf(prod9$residuals1, prod9$basket_count)

model9_2 <- lm(sold_count~trend+days+shift(basket_count, 1), data=prod9)
summary(model9_2)
Call:
lm(formula = sold_count ~ trend + days + shift(basket_count, 
    1), data = prod9)

Residuals:
     Min       1Q   Median       3Q      Max 
-120.092  -19.837   -3.592   13.588  210.497 

Coefficients:
                         Estimate Std. Error t value Pr(>|t|)    
(Intercept)             -4.046992   4.848546  -0.835   0.4045    
trend                    0.131437   0.030881   4.256 2.66e-05 ***
days.L                 -13.237320   6.282332  -2.107   0.0358 *  
days.Q                  -3.623403   6.324899  -0.573   0.5671    
days.C                   7.632546   6.266052   1.218   0.2240    
days^4                   0.713755   6.274741   0.114   0.9095    
days^5                  -4.971861   6.252058  -0.795   0.4270    
days^6                  -1.699824   6.230668  -0.273   0.7852    
shift(basket_count, 1)   0.183713   0.007801  23.549  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 45.32 on 358 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.7941,    Adjusted R-squared:  0.7895 
F-statistic: 172.5 on 8 and 358 DF,  p-value: < 2.2e-16
checkresiduals(model9_2)
    Breusch-Godfrey test for serial correlation of order up to 12

data:  Residuals
LM test = 53.348, df = 12, p-value = 3.572e-07

prod9[, residuals2:=c(NA, model9_2$residuals)]
corr <- round(cor(Filter(is.numeric, prod9[!is.na(prod9$price) & !is.na(prod9$residuals2)])), 1)
ggcorrplot(corr, hc.order = TRUE,type = "lower")

The maximum correlation with residuals2 is obtained by category_sold. Therefore, its lagged version will be added to the model. To find the lag value, we’ll check the CCF function.

ccf(prod9$residuals2, prod9$category_sold, na.action=na.pass)

model9_3 <- lm(sold_count~trend+days+shift(basket_count, 1)+shift(category_sold, 1), data=prod9)
summary(model9_3)
Call:
lm(formula = sold_count ~ trend + days + shift(basket_count, 
    1) + shift(category_sold, 1), data = prod9)

Residuals:
     Min       1Q   Median       3Q      Max 
-119.668  -19.856   -3.278   13.009  210.703 

Coefficients:
                          Estimate Std. Error t value Pr(>|t|)    
(Intercept)              -5.067239   5.620746  -0.902   0.3679    
trend                     0.132269   0.031005   4.266 2.55e-05 ***
days.L                  -13.611901   6.375415  -2.135   0.0334 *  
days.Q                   -3.338151   6.381954  -0.523   0.6013    
days.C                    7.915130   6.322570   1.252   0.2114    
days^4                    0.688536   6.282772   0.110   0.9128    
days^5                   -4.978390   6.259698  -0.795   0.4270    
days^6                   -1.722637   6.238577  -0.276   0.7826    
shift(basket_count, 1)    0.182818   0.008197  22.303  < 2e-16 ***
shift(category_sold, 1)   0.001185   0.003290   0.360   0.7190    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 45.38 on 357 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.7941,    Adjusted R-squared:  0.7889 
F-statistic:   153 on 9 and 357 DF,  p-value: < 2.2e-16

Although we added a new attribute to the model, the adjusted Rsqr value decreased. Also, category_sold is not significant according to the model output. Therefore, it’ll be removed from the model. Now, let’s check if the residuals2 is related with the lagged version of the input variable.

ccf(prod9$residuals2, prod9$sold_count, na.action=na.pass)

model9_4 <- lm(sold_count~trend+days+shift(basket_count, 1)+shift(sold_count, 1), data=prod9)
summary(model9_4)
Call:
lm(formula = sold_count ~ trend + days + shift(basket_count, 
    1) + shift(sold_count, 1), data = prod9)

Residuals:
     Min       1Q   Median       3Q      Max 
-154.121  -17.791   -3.614   13.348  211.074 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)             -4.11215    4.49509  -0.915 0.360910    
trend                    0.10774    0.02879   3.742 0.000213 ***
days.L                 -16.95943    5.84430  -2.902 0.003940 ** 
days.Q                  -3.19804    5.86407  -0.545 0.585845    
days.C                   9.67218    5.81527   1.663 0.097142 .  
days^4                   1.21133    5.81767   0.208 0.835179    
days^5                  -3.46145    5.79959  -0.597 0.550990    
days^6                  -1.83813    5.77648  -0.318 0.750512    
shift(basket_count, 1)   0.06283    0.01726   3.641 0.000312 ***
shift(sold_count, 1)     0.56367    0.07307   7.715 1.22e-13 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 42.02 on 357 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.8235,    Adjusted R-squared:  0.819 
F-statistic: 185.1 on 9 and 357 DF,  p-value: < 2.2e-16
checkresiduals(model9_4)
    Breusch-Godfrey test for serial correlation of order up to 13

data:  Residuals
LM test = 13.701, df = 13, p-value = 0.3952

So, the final model is model9_4.

prod9[, preds:=c(NA,model9_4$fitted.values)]
ggplot(prod9, aes(x=preds, y=sold_count)) + geom_point() + geom_abline(intercept=0, slope=1, color="red") 

Forecasting

forecast_data_disf <- data.table(event_date=disf[event_date>=test_start&event_date<=test_end,]$event_date,
                            sold_count=disf[event_date>=test_start&event_date<=test_end,]$sold_count)
disf[,`:=`(trend=1:.N,
          days=wday(event_date, label=T))]
sarima_fc <- numeric(0)
sarimax_fc <- numeric(0)
reg_fc <- numeric(0)
for(i in 1:length(test_dates)){
  
  train_dt <- disf[event_date<test_dates[i],]
  model_sarima <- Arima(train_dt$sold_count,order=c(2,1,2),seasonal=c(0,0,1))
  model_sarimax <- Arima(train_dt$sold_count,order=c(2,1,2),seasonal=c(0,0,1),xreg=train_dt$basket_count)
  model_reg <- lm(sold_count~trend+days+shift(basket_count, 1)+shift(sold_count, 1), data=train_dt)
  newreg <- forecast(auto.arima(train_dt$basket_count),h=1)$mean[1]
  sarima_temp <- forecast(model_sarima)
  sarimax_temp <- forecast(model_sarimax,xreg=newreg)
  reg_temp <- predict(model_reg,newdata=disf[event_date %in% c(test_dates[i],test_dates[i+1])])
  sarima_fc <- c(sarima_fc,sarima_temp$mean[1])
  sarimax_fc <- c(sarimax_fc,sarimax_temp$mean[1])
  reg_fc <- c(reg_fc,reg_temp)
  
}
reg_fc[is.na(reg_fc)] <- mean(reg_fc[!is.na(reg_fc)])
forecast_data_disf <- forecast_data_disf[,`:=`(sarima_p=sarima_fc,
                                               sarimax_p=sarimax_fc,
                                               reg_p=reg_fc[c(1,seq(2,28,2))])]
accu(forecast_data_disf$sold_count,forecast_data_disf$sarima_p)
accu(forecast_data_disf$sold_count,forecast_data_disf$sarimax_p)
accu(forecast_data_disf[1:(.N-1)]$sold_count,forecast_data_disf[2:.N]$reg_p)
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
15 149.4667 68.00826 0.4550062 -0.1823611 0.3790575 64.45033 49.22688 0.3293503 0.3293503
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
15 149.4667 68.00826 0.4550062 -0.1858003 0.3648424 61.69227 48.17218 0.3222938 0.3222938
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
14 154.7143 67.35032 0.4353206 -0.05606837 0.1490131 21.98073 20.64795 0.1334586 0.1334586
result_data_disf <- data.table(event_date=disf[event_date>=result_start&event_date<=result_end,]$event_date,
                            sold_count=disf[event_date>=result_start&event_date<=result_end,]$sold_count)
final_fc <- numeric(0)
for(i in 1:length(result_dates)){
  
  train_dt <- disf[event_date<result_dates[i],]
  model_final <- lm(sold_count~trend+days+shift(basket_count, 1)+shift(sold_count, 1), data=train_dt)
  final_temp <- predict(model_final,newdata=disf[event_date %in% c(result_dates[i],result_dates[i+1])])
  final_fc <- c(final_fc,final_temp)
  
}
final_fc[is.na(final_fc)] <- mean(final_fc[!is.na(final_fc)])
result_data_disf <- result_data_disf[,`:=`(final_p=final_fc[c(1,seq(2,28,2))])]
accu(result_data_disf[1:(.N-1)]$sold_count,result_data_disf[2:.N]$final_p)
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
14 122 68.43751 0.5609632 -0.06905548 0.1862189 17.45269 15.73057 0.1289391 0.1289391
data.frame(event_date=result_data_disf[1:(.N-1)]$event_date,
           sold_count=result_data_disf[1:(.N-1)]$sold_count,
           predictions=result_data_disf[2:.N]$final_p)
event_date sold_count predictions
2021-06-12 83 108.26948
2021-06-13 115 136.54818
2021-06-14 245 232.44108
2021-06-15 265 243.71517
2021-06-16 169 162.21124
2021-06-17 135 124.49187
2021-06-18 108 109.24042
2021-06-19 104 121.13375
2021-06-20 149 165.71563
2021-06-21 128 142.33474
2021-06-22 56 88.13032
2021-06-23 59 71.74940
2021-06-24 56 66.22381
2021-06-25 36 53.74168

Sleepy Bebek Islak Mendil

ggplot(prod8, aes(x=event_date, y=sold_count)) + geom_line() + geom_smooth(se=F)

acf(prod8$sold_count)

The ACF graph is like sinus wave. Next steps below, the data will be decomposed at different levels. Because the variance is not increasing over time, additive decomposition will be used.

prod8_weekly_dec <- decompose(ts(prod8$sold_count, freq=7), type="additive")
plot(prod8_weekly_dec)

plot(prod8_weekly_dec$random)

Now, let’s check whether the detrended and deseasonalized data is stationary.

u_test <- ur.kpss(prod9_weekly_dec$random)
summary(u_test)
####################### 
# KPSS Unit Root Test # 
####################### 

Test is of type: mu with 5 lags. 

Value of test-statistic is: 0.1869 

Critical value for a significance level of: 
                10pct  5pct 2.5pct  1pct
critical values 0.347 0.463  0.574 0.739

There is not enough evidence to reject the null hyphotesis. The data is stationary.

acf(prod8_weekly_dec$random, na.action=na.pass)
pacf(prod8_weekly_dec$random, na.action=na.pass)

The correlation values are not high. From the ACF, it can be said that the data is correlated with its previous three observations.

Monthly Decomposition

prod8_monthly <- prod8[, month:=yearmonth(event_date)]
prod8_monthly[, .(sold_count=sum(sold_count)), by=month]
prod8_monthly[, trend:=1:.N]

prod8_reg <- lm(sold_count~trend+as.factor(month), data=prod8_monthly)
prod8_monthly[, regOutput:=predict(prod8_reg, prod8_monthly)]

prod8_monthly[, random:=sold_count-regOutput]

ggplot(prod8_monthly, aes(x=event_date, y=random)) + geom_line()
month sold_count
2020 May 1181
2020 Jun 6379
2020 Jul 9588
2020 Aug 13341
2020 Sep 10967
2020 Oct 14086
2020 Nov 20286
2020 Dec 7932
2021 Jan 5088
2021 Feb 10201
2021 Mar 11813
2021 Apr 18433
2021 May 10917

u_test <- ur.kpss(prod9_monthly$random)
summary(u_test)
####################### 
# KPSS Unit Root Test # 
####################### 

Test is of type: mu with 5 lags. 

Value of test-statistic is: 0.015 

Critical value for a significance level of: 
                10pct  5pct 2.5pct  1pct
critical values 0.347 0.463  0.574 0.739

Although the mean and variance are not constant over time, the test says that the data above is stationary.

acf(prod8_monthly$random, na.action=na.pass)
pacf(prod8_monthly$random, na.action=na.pass)

There is a high correlation with lag 1.

Building an ARIMA Model

mendil <- data[product_content_id=="4066298",][order(event_date)]
mendil_train <- mendil[event_date<test_start,]
mendil_ts <- ts(mendil_train$sold_count)
ggplot(mendil_train,aes(x=event_date,y=sold_count)) + geom_line() +
labs(title= "Graph of Sleepy Towel", x= "Date", y="Quantity")

There are a lot of outliers in the data, mean isn’t constant and also variance varies through time. Differencing should be applied to the series.

ggplot(mendil_train[2:.N,],aes(x=event_date)) + geom_line(aes(y=diff(mendil_train$sold_count,1))) + 
labs(title= "Graph of Differenced Series", x= "Date", y="Quantity")

acf(diff(mendil_train$sold_count,1))
pacf(diff(mendil_train$sold_count,1))

At the ACF graph, there is a spike at lags 3 and 4 which could be MA(3) or MA(4) model. Also, there is spikes at PACF graph at lags 3 and 4 which indicates AR(3) OR AR(4) models. Seasonality is not occured in the decomposition steps so, ARIMA model could be consturcted.

arima(mendil_ts,order=c(3,1,3))
Call:
arima(x = mendil_ts, order = c(3, 1, 3))

Coefficients:
         ar1     ar2    ar3      ma1      ma2     ma3
      0.7707  0.6181  -0.55  -1.0559  -0.6087  0.6751
s.e.     NaN     NaN    NaN      NaN      NaN     NaN

sigma^2 estimated as 93717:  log likelihood = -2622.65,  aic = 5259.29
arima(mendil_ts,order=c(3,1,4))
Call:
arima(x = mendil_ts, order = c(3, 1, 4))

Coefficients:
         ar1     ar2      ar3      ma1      ma2     ma3      ma4
      0.1889  0.7175  -0.4350  -0.4573  -0.8720  0.4191  -0.0897
s.e.  0.5148  0.2188   0.2896   0.5153   0.1393  0.4993   0.1485

sigma^2 estimated as 92894:  log likelihood = -2622.29,  aic = 5260.57
arima(mendil_ts,order=c(4,1,3))
Call:
arima(x = mendil_ts, order = c(4, 1, 3))

Coefficients:
         ar1    ar2      ar3     ar4      ma1      ma2    ma3
      0.7162  0.806  -0.6826  0.0393  -0.9811  -0.8535  0.845
s.e.     NaN    NaN      NaN  0.0560      NaN      NaN    NaN

sigma^2 estimated as 93470:  log likelihood = -2622.17,  aic = 5260.34
arima(mendil_ts,order=c(4,1,4))
Call:
arima(x = mendil_ts, order = c(4, 1, 4))

Coefficients:
         ar1      ar2     ar3      ar4      ma1     ma2      ma3     ma4
      1.6669  -0.9737  0.3334  -0.1017  -1.9397  1.2318  -0.5152  0.2292
s.e.  1.1285   2.9159  2.5896   0.7923   1.1270  3.2275   3.1429  1.0390

sigma^2 estimated as 93355:  log likelihood = -2621.91,  aic = 5261.83
auto.arima(mendil_ts)
Series: mendil_ts 
ARIMA(2,0,1) with non-zero mean 

Coefficients:
         ar1      ar2      ma1      mean
      1.4711  -0.5954  -0.7410  380.7045
s.e.  0.1339   0.0841   0.1475   33.2133

sigma^2 estimated as 94207:  log likelihood=-2627.9
AIC=5265.8   AICc=5265.97   BIC=5285.34

The best ARIMA model is (3,1,3) with the lowest AIC value of 5259.29.

model_mendil_arima <- arima(mendil_ts,order=c(3,1,3))
checkresiduals(model_mendil_arima)
mendil_train$res1 <- model_mendil_arima$residuals
corr <- cor(mendil_train[!is.na(mendil_train$price),c(-1,-2)])
ggcorrplot(corr,hc.order = TRUE, type = "lower")
    Ljung-Box test

data:  Residuals from ARIMA(3,1,3)
Q* = 2.4773, df = 4, p-value = 0.6487

Model df: 6.   Total lags used: 10

Redisuals of the first model isn’t autocorrelated and mean is constant at 0. Variance is mostly constant through time. Distribution looks normal but slightly right skewed. Residuals still correlated with sold_count which is output variable and also with basket_count.

ggplot(mendil_train,aes(x=res1,y=basket_count)) +  geom_point() + geom_smooth()

ggplot(mendil_train[2:.N],aes(x=res1)) +  geom_point(aes(y=diff(mendil_train$basket_count,1))) +
geom_smooth(aes(y=diff(mendil_train$basket_count,1)))

cor(mendil_train[2:.N]$res1,diff(mendil_train$basket_count,1))
mendil_train$diff_basket <- c(NA,diff(mendil_train$basket_count,1))
mendil$diff_basket <- c(0,diff(mendil$basket_count,1))

0.675450069295372

The correlation between residuals and basket_count is non-linear but differencing the basket_count make this relation linear and with a correlation 0.675. We could add differenced basket_count as a regressor.

arima(mendil_ts,order=c(3,1,3),xreg=mendil_train$diff_basket)
Call:
arima(x = mendil_ts, order = c(3, 1, 3), xreg = mendil_train$diff_basket)

Coefficients:
         ar1     ar2      ar3      ma1      ma2     ma3
      0.6481  0.4425  -0.3994  -0.8053  -0.6060  0.4141
s.e.  0.4583  0.4959   0.2100   0.4639   0.5492  0.2354
      mendil_train$diff_basket
                        0.1360
s.e.                    0.0143

sigma^2 estimated as 76265:  log likelihood = -2578.55,  aic = 5173.11

After the addition of regressor, AIC value is reduced slightly.

model_mendil_arimax <- arima(mendil_ts,order=c(3,1,3),xreg=mendil_train$diff_basket)
checkresiduals(model_mendil_arimax)
mendil_train$res2 <- model_mendil_arimax$residuals
corr <- cor(mendil_train[!is.na(mendil_train$price)& !is.na(mendil_train$res2),c(-1,-2)])
ggcorrplot(corr,hc.order = TRUE, type = "lower")
    Ljung-Box test

data:  Residuals from ARIMA(3,1,3)
Q* = 3.3769, df = 3, p-value = 0.3371

Model df: 7.   Total lags used: 10

Residuals are more stationary than the previous model. Variance looks more constant and the distribution have mean near to the zero.

Linear Models

Firstly, the days of week and trend columns will be added to the data. Next, the first model will be built with trend and weekly seasonality. Then, we’ll look for possible regressors and add them to the model. Finally, we will find the best model among them.

prod8[, `:=`(trend=1:.N, 
            days=wday(event_date, label=T))]

model8_1 <- lm(sold_count~trend+days, data=prod8)
summary(model8_1)
Call:
lm(formula = sold_count ~ trend + days, data = prod8)

Residuals:
   Min     1Q Median     3Q    Max 
-423.4 -213.5 -102.4   40.2 3767.2 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  306.9811    43.1464   7.115 6.09e-12 ***
trend          0.3968     0.2027   1.958  0.05101 .  
days.L       -79.7345    57.1547  -1.395  0.16386    
days.Q      -215.1357    57.0515  -3.771  0.00019 ***
days.C        52.2256    57.0005   0.916  0.36016    
days^4        11.4518    56.9659   0.201  0.84079    
days^5        24.9918    56.8447   0.440  0.66045    
days^6        31.2106    56.7503   0.550  0.58269    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 413 on 360 degrees of freedom
Multiple R-squared:  0.05607,   Adjusted R-squared:  0.03771 
F-statistic: 3.055 on 7 and 360 DF,  p-value: 0.003897
checkresiduals(model8_1)
    Breusch-Godfrey test for serial correlation of order up to 11

data:  Residuals
LM test = 176.14, df = 11, p-value < 2.2e-16

prod8[, residuals1:=model8_1$residuals]
corr <- round(cor(Filter(is.numeric, prod8[!is.na(prod8$price)])), 1)
ggcorrplot(corr, hc.order = TRUE,type = "lower")

Again, the maximum correlation with residuals1 is obtained by basket_count. Therefore, its lagged version will be added to the model. To find the lag value, we’ll check the CCF function.

ccf(prod8$residuals1, prod8$basket_count)

model8_2 <- lm(sold_count~trend+days+shift(basket_count, 1), data=prod8)
summary(model8_2)
Call:
lm(formula = sold_count ~ trend + days + shift(basket_count, 
    1), data = prod8)

Residuals:
    Min      1Q  Median      3Q     Max 
-996.74 -109.80  -31.87   53.41 2246.28 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)             153.5586    31.6142   4.857 1.78e-06 ***
trend                    -0.4226     0.1494  -2.828 0.004941 ** 
days.L                 -139.8203    40.4207  -3.459 0.000607 ***
days.Q                  -68.3453    40.9012  -1.671 0.095599 .  
days.C                   74.2299    40.2190   1.846 0.065770 .  
days^4                  -20.9278    40.2686  -0.520 0.603590    
days^5                   55.9545    40.1351   1.394 0.164135    
days^6                   50.4709    39.9942   1.262 0.207786    
shift(basket_count, 1)    0.3182     0.0166  19.166  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 290.8 on 358 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.534, Adjusted R-squared:  0.5236 
F-statistic: 51.29 on 8 and 358 DF,  p-value: < 2.2e-16
checkresiduals(model8_2)
    Breusch-Godfrey test for serial correlation of order up to 12

data:  Residuals
LM test = 24.397, df = 12, p-value = 0.01795

prod8[, residuals2:=c(NA, model8_2$residuals)]
corr <- round(cor(Filter(is.numeric, prod8[!is.na(prod8$price) & !is.na(prod8$residuals2)])), 1)
ggcorrplot(corr, hc.order = TRUE,type = "lower")

The maximum correlation with residuals2 is obtained by price. Therefore, its lagged version will be added to the model. To find the lag value, we’ll check the CCF function.

ccf(prod8$residuals2, prod8$price, na.action=na.pass)

model8_3 <- lm(sold_count~trend+days+shift(basket_count, 1)+shift(price, 1), data=prod8)
summary(model8_3)
Call:
lm(formula = sold_count ~ trend + days + shift(basket_count, 
    1) + shift(price, 1), data = prod8)

Residuals:
    Min      1Q  Median      3Q     Max 
-964.16 -109.00  -23.47   60.79 2308.12 

Coefficients:
                         Estimate Std. Error t value Pr(>|t|)    
(Intercept)             933.10946  322.80221   2.891 0.004080 ** 
trend                    -0.41970    0.14840  -2.828 0.004945 ** 
days.L                 -137.73128   40.15684  -3.430 0.000675 ***
days.Q                  -84.58368   41.17239  -2.054 0.040667 *  
days.C                   74.47396   39.94734   1.864 0.063100 .  
days^4                  -16.78058   40.03297  -0.419 0.675345    
days^5                   51.08520   39.91434   1.280 0.201422    
days^6                   48.78471   39.73004   1.228 0.220292    
shift(basket_count, 1)    0.29074    0.02001  14.529  < 2e-16 ***
shift(price, 1)         -10.66572    4.39560  -2.426 0.015742 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 288.9 on 357 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.5416,    Adjusted R-squared:  0.5301 
F-statistic: 46.87 on 9 and 357 DF,  p-value: < 2.2e-16
checkresiduals(model8_3)
    Breusch-Godfrey test for serial correlation of order up to 13

data:  Residuals
LM test = 30.421, df = 13, p-value = 0.004094

Now, let’s check if the residuals3 is related with the lagged version of the input variable.

prod8[, residuals3:=c(NA, model8_3$fitted.values)]
ccf(prod8$residuals3, prod8$sold_count, na.action=na.pass)

model8_4 <- lm(sold_count~trend+days+shift(basket_count, 1)+shift(price, 1)+shift(sold_count, 1), data=prod8)
summary(model8_4)
Call:
lm(formula = sold_count ~ trend + days + shift(basket_count, 
    1) + shift(price, 1) + shift(sold_count, 1), data = prod8)

Residuals:
    Min      1Q  Median      3Q     Max 
-872.43 -110.85  -26.94   52.75 2455.57 

Coefficients:
                         Estimate Std. Error t value Pr(>|t|)    
(Intercept)             762.54803  330.86312   2.305 0.021757 *  
trend                    -0.30664    0.15677  -1.956 0.051244 .  
days.L                 -142.54237   40.01819  -3.562 0.000418 ***
days.Q                  -77.92131   41.08322  -1.897 0.058681 .  
days.C                   79.00352   39.80290   1.985 0.047926 *  
days^4                  -16.05225   39.83359  -0.403 0.687203    
days^5                   53.57546   39.73104   1.348 0.178370    
days^6                   46.26115   39.54820   1.170 0.242888    
shift(basket_count, 1)    0.22331    0.03719   6.005 4.71e-09 ***
shift(price, 1)          -8.60714    4.47743  -1.922 0.055361 .  
shift(sold_count, 1)      0.18147    0.08453   2.147 0.032482 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 287.4 on 356 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.5475,    Adjusted R-squared:  0.5348 
F-statistic: 43.07 on 10 and 356 DF,  p-value: < 2.2e-16
checkresiduals(model8_4)
    Breusch-Godfrey test for serial correlation of order up to 14

data:  Residuals
LM test = 28.99, df = 14, p-value = 0.01048

So, the final model is model8_4.

prod8[, preds:=c(NA,model8_4$fitted.values)]
ggplot(prod8, aes(x=preds, y=sold_count)) + geom_point() + geom_abline(intercept=0, slope=1, color="red") 

Forecasting

forecast_data_mendil <- data.table(event_date=mendil[event_date>=test_start&event_date<=test_end,]$event_date,
                            sold_count=mendil[event_date>=test_start&event_date<=test_end,]$sold_count)
mendil[,`:=`(trend=1:.N,
           days=wday(event_date, label=T))]
arima_fc <- numeric(0)
arimax_fc <- numeric(0)
reg_fc <- numeric(0)
for(i in 1:length(test_dates)){
  
  train_dt <- mendil[event_date<test_dates[i],]
  model_arima <- Arima(train_dt$sold_count,order=c(3,1,3))
  model_arimax <- Arima(train_dt$sold_count,order=c(3,1,3),xreg=train_dt$diff_basket)
  model_reg <- lm(sold_count~trend+days+shift(basket_count, 1)+shift(price, 1)+shift(sold_count, 1), data=train_dt)
  newreg <- forecast(auto.arima(train_dt$diff_basket),h=1)$mean[1]
  arima_temp <- forecast(model_arima)
  arimax_temp <- forecast(model_arimax,xreg=newreg)
  reg_temp <- predict(model_reg,newdata=mendil[event_date %in% c(test_dates[i],test_dates[i+1])])
  arima_fc <- c(arima_fc,arima_temp$mean[1])
  arimax_fc <- c(arimax_fc,arimax_temp$mean[1])
  reg_fc <- c(reg_fc,reg_temp)
  
}
reg_fc[is.na(reg_fc)] <- mean(reg_fc[!is.na(reg_fc)])
forecast_data_mendil <- forecast_data_mendil[,`:=`(arima_p=arima_fc,
                                               arimax_p=arimax_fc,
                                               reg_p=reg_fc[c(1,seq(2,28,2))])]
accu(forecast_data_mendil$sold_count,forecast_data_mendil$arima_p)
accu(forecast_data_mendil$sold_count,forecast_data_mendil$arimax_p)
accu(forecast_data_mendil[1:(.N-1)]$sold_count,forecast_data_mendil[2:.N]$reg_p)
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
15 542.1333 323.7585 0.5971935 0.07277015 0.2194611 237.5628 128.2689 0.2366004 0.2366004
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
15 542.1333 323.7585 0.5971935 0.07918613 0.2462374 240.2728 139.0699 0.2565234 0.2565234
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
14 564.4286 323.8099 0.5736952 0.02045764 0.1574298 99.59298 79.30958 0.1405131 0.1405131
result_data_mendil <- data.table(event_date=mendil[event_date>=result_start&event_date<=result_end,]$event_date,
                               sold_count=mendil[event_date>=result_start&event_date<=result_end,]$sold_count)
final_fc <- numeric(0)
for(i in 1:length(result_dates)){
  
  train_dt <- mendil[event_date<result_dates[i],]
  model_final <- lm(sold_count~trend+days+shift(basket_count, 1)+shift(price, 1)+shift(sold_count, 1), data=train_dt)
  final_temp <- predict(model_final,newdata=mendil[event_date %in% c(result_dates[i],result_dates[i+1])])
  final_fc <- c(final_fc,final_temp)
  
}
final_fc[is.na(final_fc)] <- mean(final_fc[!is.na(final_fc)])
result_data_mendil <- result_data_mendil[,`:=`(final_p=final_fc[c(1,seq(2,28,2))])]
accu(result_data_mendil[1:(.N-1)]$sold_count,result_data_mendil[2:.N]$final_p)
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
14 644.5 574.7062 0.8917086 0.09331918 0.1565647 128.914 93.02199 0.144332 0.144332
data.frame(event_date=result_data_mendil[1:(.N-1)]$event_date,
           sold_count=result_data_mendil[1:(.N-1)]$sold_count,
           predictions=result_data_mendil[2:.N]$final_p)
event_date sold_count predictions
2021-06-12 236 239.6787
2021-06-13 298 395.1263
2021-06-14 269 325.2178
2021-06-15 381 299.5827
2021-06-16 383 384.2660
2021-06-17 494 347.6472
2021-06-18 2362 1994.6812
2021-06-19 1319 1178.6893
2021-06-20 892 811.0426
2021-06-21 754 733.6070
2021-06-22 364 407.2992
2021-06-23 432 460.5565
2021-06-24 453 331.6753
2021-06-25 386 271.9113

La Roche Posay Yüz Temizleyici

ggplot(prod7, aes(x=event_date, y=sold_count)) + geom_line() + geom_smooth(se=F)

acf(prod7$sold_count)

First three lags and lag 15 have high correlation values. Now let’s decompose the data.

Weekly Decomposition

Because the variance does not increase over time, additive decomposition will be used.

prod7_weekly_dec <- decompose(ts(prod7$sold_count, freq=7), type="additive")
plot(prod7_weekly_dec)

plot(prod7_weekly_dec$random)

Now, let’s check whether the detrended and deseasonalized data is stationary.

u_test <- ur.kpss(prod7_weekly_dec$random)
summary(u_test)
####################### 
# KPSS Unit Root Test # 
####################### 

Test is of type: mu with 5 lags. 

Value of test-statistic is: 0.0085 

Critical value for a significance level of: 
                10pct  5pct 2.5pct  1pct
critical values 0.347 0.463  0.574 0.739

From the test results, it can be said that the data is stationary.

acf(prod7_weekly_dec$random, na.action=na.pass)
pacf(prod7_weekly_dec$random, na.action=na.pass)

The ACF graph is like a sinus wave. There are still correlatinons at different level of lags.

Monthly Decomposition

prod7_monthly <- prod7[, month:=yearmonth(event_date)]
prod7_monthly[, .(sold_count=sum(sold_count)), by=month]
prod7_monthly[, trend:=1:.N]

prod7_reg <- lm(sold_count~trend+as.factor(month), data=prod7_monthly)
prod7_monthly[, regOutput:=predict(prod7_reg, prod7_monthly)]

prod7_monthly[, random:=sold_count-regOutput]

ggplot(prod7_monthly, aes(x=event_date, y=random)) + geom_line()
month sold_count
2020 May 247
2020 Jun 1416
2020 Jul 1125
2020 Aug 1360
2020 Sep 1471
2020 Oct 1349
2020 Nov 3624
2020 Dec 2098
2021 Jan 3671
2021 Feb 2392
2021 Mar 2252
2021 Apr 2925
2021 May 3387

Let’s check the unit test result in order to see whether the data is stationary.

u_test <- ur.kpss(prod7_monthly$random)
summary(u_test)
####################### 
# KPSS Unit Root Test # 
####################### 

Test is of type: mu with 5 lags. 

Value of test-statistic is: 0.0094 

Critical value for a significance level of: 
                10pct  5pct 2.5pct  1pct
critical values 0.347 0.463  0.574 0.739

According to the KPSS test result, the data is stationary.

acf(prod7_monthly$random, na.action=na.pass)
pacf(prod7_monthly$random, na.action=na.pass)

It seems that there is a correlation at lag 1.

Building an ARIMA Model

jel <- data[product_content_id=="85004",][order(event_date)]
jel_train <- jel[event_date<test_start,]
jel_ts <- ts(jel_train$sold_count)
ggplot(jel_train,aes(x=event_date,y=sold_count)) + geom_line() +
labs(title= "Graph of SLa Roche Posay", x= "Date", y="Quantity")

Again, series are not stationariy. Analysis could start with differencing to make it stationary.

ggplot(jel_train[2:.N,],aes(x=event_date)) + geom_line(aes(y=diff(jel_train$sold_count,1))) + 
labs(title= "Graph of Differenced Series", x= "Date", y="Quantity")

acf(diff(jel_train$sold_count,1))
pacf(diff(jel_train$sold_count,1))

In the ACF graph there is spikes at lag 3 and 4. Pacf graph shows spikes at lag 3 and 4 also. Decomposition tells there is no seasonality, so an ARIMA model could be constructed.

arima(jel_ts,order=c(3,1,3))
Call:
arima(x = jel_ts, order = c(3, 1, 3))

Coefficients:
         ar1      ar2     ar3      ma1     ma2      ma3
      0.9504  -0.7389  0.3324  -1.1857  0.9067  -0.6940
s.e.  0.1449   0.2401  0.1439   0.1258  0.2309   0.1436

sigma^2 estimated as 1606:  log likelihood = -1876.32,  aic = 3766.64
arima(jel_ts,order=c(4,1,3))
Call:
arima(x = jel_ts, order = c(4, 1, 3))

Coefficients:
         ar1      ar2     ar3      ar4      ma1     ma2      ma3
      0.8253  -0.6122  0.3456  -0.0752  -1.0495  0.7263  -0.6456
s.e.  0.2207   0.3148  0.1546   0.0784   0.2135  0.3428   0.1760

sigma^2 estimated as 1602:  log likelihood = -1875.84,  aic = 3767.69
arima(jel_ts,order=c(3,1,4))
Call:
arima(x = jel_ts, order = c(3, 1, 4))

Coefficients:
         ar1      ar2     ar3      ma1     ma2      ma3      ma4
      0.7113  -0.5170  0.2363  -0.9379  0.6095  -0.5293  -0.1082
s.e.  0.3671   0.4088  0.2026   0.3661  0.4841   0.2715   0.1382

sigma^2 estimated as 1604:  log likelihood = -1876,  aic = 3767.99
arima(jel_ts,order=c(4,1,4))
Call:
arima(x = jel_ts, order = c(4, 1, 4))

Coefficients:
         ar1      ar2     ar3      ar4      ma1     ma2      ma3     ma4
      1.2843  -1.0413  0.6887  -0.2443  -1.5108  1.2633  -1.0618  0.3289
s.e.     NaN      NaN     NaN      NaN      NaN     NaN      NaN     NaN

sigma^2 estimated as 1596:  log likelihood = -1875.17,  aic = 3768.34
auto.arima(jel_ts)
Series: jel_ts 
ARIMA(0,1,0) 

sigma^2 estimated as 1933:  log likelihood=-1909.24
AIC=3820.48   AICc=3820.49   BIC=3824.39

The best model is ARIMA(3,1,3) with an AIC of 3766.64. Thus, we could use it in forecast steps.

model_jel_arima <- arima(jel_ts,order=c(3,1,3))
checkresiduals(model_jel_arima)
jel_train$res1 <- model_jel_arima$residuals
corr <- cor(jel_train[!is.na(jel_train$price),c(-1,-2)])
ggcorrplot(corr,hc.order = TRUE, type = "lower")
    Ljung-Box test

data:  Residuals from ARIMA(3,1,3)
Q* = 4.6371, df = 4, p-value = 0.3266

Model df: 6.   Total lags used: 10

Residuals have constant mean at 0 and variance looks constant although there is some outliers. There is not lags with correlation. Distribution is slightly right skewed. Residuals are correlated with category_visits.

ggplot(jel_train,aes(x=res1,y=category_visits)) +  geom_point() + geom_smooth()

ggplot(jel_train[2:.N],aes(x=res1)) +  geom_point(aes(y=diff(jel_train$category_visits,1))) +
geom_smooth(aes(y=diff(jel_train$category_visits,1)))

cor(jel_train[2:.N]$res1,diff(jel_train$category_visits,1))
jel_train$diff_visits <- c(NA,diff(jel_train$category_visits,1))
jel$diff_visits <- c(NA,diff(jel$category_visits,1))

0.696385699753894

Differenced category_visits is highlt correlated with residuals with a value of 0.696. We could add it as a regressor.

arima(jel_ts,order=c(3,1,3),xreg=jel_train$diff_visits)
Call:
arima(x = jel_ts, order = c(3, 1, 3), xreg = jel_train$diff_visits)

Coefficients:
         ar1     ar2      ar3      ma1      ma2     ma3  jel_train$diff_visits
      0.6067  0.3794  -0.3508  -0.6246  -0.5043  0.1567                 0.0096
s.e.  0.3098  0.4221   0.2103   0.3185   0.4393  0.2214                 0.0007

sigma^2 estimated as 1115:  log likelihood = -1804.34,  aic = 3624.68
auto.arima(jel_ts,xreg=jel_train$diff_visits)
Series: jel_ts 
Regression with ARIMA(1,1,0) errors 

Coefficients:
         ar1    xreg
      0.1296  0.0095
s.e.  0.0518  0.0007

sigma^2 estimated as 1321:  log likelihood=-1833.85
AIC=3673.69   AICc=3673.76   BIC=3685.4
model_jel_arimax <- arima(jel_ts,order=c(3,1,3),xreg=jel_train$diff_visits)
checkresiduals(model_jel_arimax)
jel_train$res2 <- model_jel_arimax$residuals
corr <- cor(jel_train[!is.na(jel_train$price)& !is.na(jel_train$res2),c(-1,-2)])
ggcorrplot(corr,hc.order = TRUE, type = "lower")
    Ljung-Box test

data:  Residuals from ARIMA(3,1,3)
Q* = 4.1249, df = 3, p-value = 0.2483

Model df: 7.   Total lags used: 10

Residuals don’t look changed much in the stationary manner. However, we could see its success in the test period.

Linear Models

Firstly, the days of week and trend columns will be added to the data. Next, the first model will be built with trend and weekly seasonality. Then, we’ll look for possible regressors and add them to the model. Finally, we will find the best model among them.

prod7[, `:=`(trend=1:.N, 
            days=wday(event_date, label=T))]

model7_1 <- lm(sold_count~trend+days, data=prod7)
summary(model7_1)
Call:
lm(formula = sold_count ~ trend + days, data = prod7)

Residuals:
   Min     1Q Median     3Q    Max 
-84.38 -31.46 -13.42  10.38 378.87 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  33.15644    6.11910   5.419 1.10e-07 ***
trend         0.22216    0.02874   7.730 1.09e-13 ***
days.L       -2.44024    8.10578  -0.301 0.763550    
days.Q      -26.92477    8.09114  -3.328 0.000966 ***
days.C        1.98732    8.08392   0.246 0.805949    
days^4        6.76780    8.07901   0.838 0.402754    
days^5       -0.87616    8.06181  -0.109 0.913517    
days^6        0.51162    8.04843   0.064 0.949350    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 58.57 on 360 degrees of freedom
Multiple R-squared:  0.1663,    Adjusted R-squared:  0.1501 
F-statistic: 10.26 on 7 and 360 DF,  p-value: 9.871e-12
checkresiduals(model7_1)
    Breusch-Godfrey test for serial correlation of order up to 11

data:  Residuals
LM test = 205.56, df = 11, p-value < 2.2e-16

prod7[, residuals1:=model7_1$residuals]
corr <- round(cor(Filter(is.numeric, prod7[!is.na(prod7$price)])), 1)
ggcorrplot(corr, hc.order = TRUE,type = "lower")

Again, the maximum correlation with residuals1 is obtained by basket_count. Therefore, its lagged version will be added to the model. To find the lag value, we’ll check the CCF function.

ccf(prod7$residuals1, prod7$basket_count)

model7_2 <- lm(sold_count~trend+days+shift(basket_count, 1), data=prod7)
summary(model7_2)
Call:
lm(formula = sold_count ~ trend + days + shift(basket_count, 
    1), data = prod7)

Residuals:
     Min       1Q   Median       3Q      Max 
-136.244  -20.065   -4.351    9.659  255.202 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)             20.24619    4.72975   4.281  2.4e-05 ***
trend                   -0.10754    0.02974  -3.616 0.000342 ***
days.L                  -8.48641    6.16460  -1.377 0.169485    
days.Q                 -15.57661    6.17292  -2.523 0.012056 *  
days.C                   7.84887    6.14866   1.277 0.202600    
days^4                   2.88369    6.14785   0.469 0.639314    
days^5                  -4.49943    6.12692  -0.734 0.463203    
days^6                   3.55106    6.10660   0.582 0.561262    
shift(basket_count, 1)   0.21056    0.01285  16.382  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 44.4 on 358 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.523, Adjusted R-squared:  0.5124 
F-statistic: 49.07 on 8 and 358 DF,  p-value: < 2.2e-16
checkresiduals(model7_2)
    Breusch-Godfrey test for serial correlation of order up to 12

data:  Residuals
LM test = 78.179, df = 12, p-value = 9.171e-12

prod7[, residuals2:=c(NA, model7_2$residuals)]
corr <- round(cor(Filter(is.numeric, prod7[!is.na(prod7$price) & !is.na(prod7$residuals2)])), 1)
ggcorrplot(corr, hc.order = TRUE,type = "lower")

The maximum correlation with residuals2 is obtained by price. Therefore, its lagged version will be added to the model. To find the lag value, we’ll check the CCF function.

ccf(prod7$residuals2, prod7$price, na.action=na.pass)

model7_3 <- lm(sold_count~trend+days+shift(basket_count, 1)+shift(price, 1), data=prod7)
summary(model7_3)
Call:
lm(formula = sold_count ~ trend + days + shift(basket_count, 
    1) + shift(price, 1), data = prod7)

Residuals:
     Min       1Q   Median       3Q      Max 
-127.088  -17.502   -3.361   11.935  252.851 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)            301.58275   41.64373   7.242 2.74e-12 ***
trend                    0.02501    0.03414   0.732 0.464425    
days.L                  -9.88470    5.81266  -1.701 0.089899 .  
days.Q                 -19.56064    5.84633  -3.346 0.000908 ***
days.C                   9.27049    5.79777   1.599 0.110711    
days^4                   2.46919    5.79355   0.426 0.670221    
days^5                  -5.02789    5.77403  -0.871 0.384462    
days^6                   2.25716    5.75751   0.392 0.695264    
shift(basket_count, 1)   0.18102    0.01287  14.068  < 2e-16 ***
shift(price, 1)         -3.81227    0.56105  -6.795 4.56e-11 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 41.84 on 357 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.5776,    Adjusted R-squared:  0.567 
F-statistic: 54.25 on 9 and 357 DF,  p-value: < 2.2e-16
checkresiduals(model7_3)
    Breusch-Godfrey test for serial correlation of order up to 13

data:  Residuals
LM test = 46.738, df = 13, p-value = 1.07e-05

Now, let’s check if the residuals3 is related with the lagged version of the input variable.

prod7[, residuals3:=c(NA,model7_3$residuals)]
ccf(prod7$residuals3, prod7$sold_count, na.action=na.pass)

model7_4 <- lm(sold_count~trend+days+shift(basket_count, 1)+shift(price, 1)+shift(sold_count,1), data=prod7)
summary(model7_4)
Call:
lm(formula = sold_count ~ trend + days + shift(basket_count, 
    1) + shift(price, 1) + shift(sold_count, 1), data = prod7)

Residuals:
     Min       1Q   Median       3Q      Max 
-115.852  -16.895   -2.886   11.329  278.090 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)            152.45288   45.68858   3.337 0.000937 ***
trend                    0.05147    0.03260   1.579 0.115236    
days.L                 -13.16344    5.52940  -2.381 0.017809 *  
days.Q                 -15.24056    5.57813  -2.732 0.006605 ** 
days.C                  10.83203    5.49732   1.970 0.049565 *  
days^4                   3.22380    5.48926   0.587 0.557380    
days^5                  -3.09814    5.47766  -0.566 0.572025    
days^6                   1.75516    5.45443   0.322 0.747804    
shift(basket_count, 1)   0.06804    0.02130   3.195 0.001524 ** 
shift(price, 1)         -1.89705    0.60835  -3.118 0.001967 ** 
shift(sold_count, 1)     0.47766    0.07383   6.470 3.25e-10 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 39.63 on 356 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.6221,    Adjusted R-squared:  0.6115 
F-statistic:  58.6 on 10 and 356 DF,  p-value: < 2.2e-16
checkresiduals(model7_4)
    Breusch-Godfrey test for serial correlation of order up to 14

data:  Residuals
LM test = 24.864, df = 14, p-value = 0.03593

So, the final model is model7_4.

prod7[, preds:=c(NA,model7_4$fitted.values)]
ggplot(prod7, aes(x=preds, y=sold_count)) + geom_point() + geom_abline(intercept=0, slope=1, color="red") 

Forecasting

forecast_data_jel <- data.table(event_date=jel[event_date>=test_start&event_date<=test_end,]$event_date,
                            sold_count=jel[event_date>=test_start&event_date<=test_end,]$sold_count)
jel[,`:=`(trend=1:.N,
           days=wday(event_date, label=T))]
arima_fc <- numeric(0)
arimax_fc <- numeric(0)
reg_fc <- numeric(0)
for(i in 1:length(test_dates)){
  
  train_dt <- jel[event_date<test_dates[i],]
  model_arima <- Arima(train_dt$sold_count,order=c(3,1,4))
  model_arimax <- Arima(train_dt$sold_count,order=c(3,1,4),xreg=train_dt$diff_visits)
  model_reg <- lm(sold_count~trend+days+shift(basket_count, 1)+shift(price, 1)+shift(sold_count,1), data=train_dt)
  newreg <- forecast(auto.arima(train_dt$diff_visits),h=1)$mean[1]
  arima_temp <- forecast(model_arima)
  arimax_temp <- forecast(model_arimax,xreg=newreg)
  reg_temp <- predict(model_reg,newdata=jel[event_date %in% c(test_dates[i],test_dates[i+1])])
  arima_fc <- c(arima_fc,arima_temp$mean[1])
  arimax_fc <- c(arimax_fc,arimax_temp$mean[1])
  reg_fc <- c(reg_fc,reg_temp)
  
}
reg_fc[is.na(reg_fc)] <- mean(reg_fc[!is.na(reg_fc)])
forecast_data_jel <- forecast_data_jel[,`:=`(arima_p=arima_fc,
                                               arimax_p=arimax_fc,
                                               reg_p=reg_fc[c(1,seq(2,28,2))])]
accu(forecast_data_jel$sold_count,forecast_data_jel$arima_p)
accu(forecast_data_jel$sold_count,forecast_data_jel$arimax_p)
accu(forecast_data_jel[1:(.N-1)]$sold_count,forecast_data_jel[2:.N]$reg_p)
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
15 81.26667 18.35159 0.2258194 -0.06315798 0.2085848 20.27698 16.63532 0.2047004 0.2047004
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
15 81.26667 18.35159 0.2258194 -0.01421011 0.2677661 29.46622 20.73637 0.2551646 0.2551646
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
14 80.85714 18.97309 0.2346495 -0.001894587 0.1369498 14.5462 11.1321 0.1376762 0.1376762
result_data_jel <- data.table(event_date=jel[event_date>=result_start&event_date<=result_end,]$event_date,
                               sold_count=jel[event_date>=result_start&event_date<=result_end,]$sold_count)
final_fc <- numeric(0)
for(i in 1:length(result_dates)){
  
  train_dt <- jel[event_date<result_dates[i],]
  model_final <- lm(sold_count~trend+days+shift(basket_count, 1)+shift(sold_count,1), data=train_dt)
  final_temp <- predict(model_final,newdata=jel[event_date %in% c(result_dates[i],result_dates[i+1])])
  final_fc <- c(final_fc,final_temp)
  
}
final_fc[is.na(final_fc)] <- mean(final_fc[!is.na(final_fc)])
result_data_jel <- result_data_jel[,`:=`(final_p=final_fc[c(1,seq(2,28,2))])]
accu(result_data_jel[1:(.N-1)]$sold_count,result_data_jel[2:.N]$final_p)
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
14 70.57143 15.1643 0.2148788 -0.01595804 0.1160754 9.21143 7.27536 0.1030922 0.1030922
data.frame(event_date=result_data_jel[1:(.N-1)]$event_date,
           sold_count=result_data_jel[1:(.N-1)]$sold_count,
           predictions=result_data_jel[2:.N]$final_p)
event_date sold_count predictions
2021-06-12 56 59.58912
2021-06-13 95 97.14483
2021-06-14 47 70.55064
2021-06-15 78 81.85642
2021-06-16 81 76.29060
2021-06-17 79 65.75714
2021-06-18 57 50.92852
2021-06-19 77 72.34972
2021-06-20 83 86.71792
2021-06-21 76 89.06199
2021-06-22 83 91.88987
2021-06-23 75 73.00430
2021-06-24 51 44.15541
2021-06-25 50 44.47006

Fakir Dik Süpürge

ggplot(prod6, aes(x=event_date, y=sold_count)) + geom_line() + geom_smooth(se=F)

acf(prod6$sold_count)

On the graph above, there is a peak point at lag 16.

Weekly Decomposition

Additive decomposition will be used due to the non-increasing variance.

prod6_weekly_dec <- decompose(ts(prod6$sold_count, freq=7), type="additive")
plot(prod6_weekly_dec)

plot(prod6_weekly_dec$random)

u_test <- ur.kpss(prod6_weekly_dec$random)
summary(u_test)
####################### 
# KPSS Unit Root Test # 
####################### 

Test is of type: mu with 5 lags. 

Value of test-statistic is: 0.0067 

Critical value for a significance level of: 
                10pct  5pct 2.5pct  1pct
critical values 0.347 0.463  0.574 0.739

From the KPSS test above, it can be said that the data is stationary.

acf(prod6_weekly_dec$random, na.action=na.pass)
pacf(prod6_weekly_dec$random, na.action=na.pass)

There are slightly correlations at different level of lags.

Monthly Decomposition

prod6_monthly <- prod6[, month:=yearmonth(event_date)]
prod6_monthly[, .(sold_count=sum(sold_count)), by=month]
prod6_monthly[, trend:=1:.N]

prod6_reg <- lm(sold_count~trend+as.factor(month), data=prod6_monthly)
prod6_monthly[, regOutput:=predict(prod6_reg, prod6_monthly)]

prod6_monthly[, random:=sold_count-regOutput]

ggplot(prod6_monthly, aes(x=event_date, y=random)) + geom_line()
month sold_count
2020 May 315
2020 Jun 1608
2020 Jul 908
2020 Aug 770
2020 Sep 1451
2020 Oct 1850
2020 Nov 2931
2020 Dec 1145
2021 Jan 621
2021 Feb 512
2021 Mar 740
2021 Apr 899
2021 May 970

u_test <- ur.kpss(prod6_monthly$random)
summary(u_test)
####################### 
# KPSS Unit Root Test # 
####################### 

Test is of type: mu with 5 lags. 

Value of test-statistic is: 0.0136 

Critical value for a significance level of: 
                10pct  5pct 2.5pct  1pct
critical values 0.347 0.463  0.574 0.739

The test result says that monthly decomposed data is stationary.

acf(prod6_monthly$random)
pacf(prod6_monthly$random)

The ACF graph is like sinus wave with a peak value at lag 16. Also a high correlation with lag 1 can be seen on the PACF graph.

Building an ARIMA Model

fakir <- data[product_content_id=="7061886",][order(event_date)]
fakir_train <- fakir[event_date<test_start,]
fakir_ts <- ts(fakir_train$sold_count)
ggplot(fakir_train,aes(x=event_date,y=sold_count)) + geom_line() +
labs(title= "Graph of Fakir Vacuum Clenaer", x= "Date", y="Quantity")

Fakir vacuum cleaner data is not stationary, because of the trend and outliers.

ggplot(fakir_train[2:.N,],aes(x=event_date)) + geom_line(aes(y=diff(fakir_train$sold_count,1))) + 
  labs(title= "Graph of Differenced Series", x= "Date", y="Quantity")

acf(diff(fakir_train$sold_count,1))
pacf(diff(fakir_train$sold_count,1))

After analyzing the ACF and PACF graphs of the differenced series, AR(1), AR(4), MA(1) and MA(4) models could be tried as a ARIMA model.

arima(fakir_ts,order=c(1,1,1))
Call:
arima(x = fakir_ts, order = c(1, 1, 1))

Coefficients:
          ar1     ma1
      -0.4581  0.2845
s.e.   0.1579  0.1660

sigma^2 estimated as 736.7:  log likelihood = -1732.27,  aic = 3470.55
arima(fakir_ts,order=c(4,1,1))
Call:
arima(x = fakir_ts, order = c(4, 1, 1))

Coefficients:
         ar1     ar2      ar3      ar4      ma1
      0.6143  0.2204  -0.1475  -0.1644  -0.9235
s.e.  0.0572  0.0601   0.0599   0.0542   0.0294

sigma^2 estimated as 619.7:  log likelihood = -1701.18,  aic = 3414.36
arima(fakir_ts,order=c(1,1,4))
Call:
arima(x = fakir_ts, order = c(1, 1, 4))

Coefficients:
         ar1      ma1     ma2      ma3      ma4
      0.3058  -0.6019  0.1564  -0.2170  -0.2528
s.e.  0.1261   0.1239  0.0787   0.0607   0.0734

sigma^2 estimated as 609.8:  log likelihood = -1698.29,  aic = 3408.59
arima(fakir_ts,order=c(4,1,4))
Call:
arima(x = fakir_ts, order = c(4, 1, 4))

Coefficients:
         ar1     ar2      ar3      ar4      ma1     ma2      ma3      ma4
      0.2548  0.1198  -0.0793  -0.0397  -0.5502  0.0220  -0.1103  -0.2626
s.e.  0.4399  0.3897   0.2508   0.1608   0.4383  0.5072   0.3697   0.3029

sigma^2 estimated as 608.8:  log likelihood = -1698,  aic = 3414
auto.arima(fakir_ts)
Series: fakir_ts 
ARIMA(0,1,5) 

Coefficients:
          ma1     ma2      ma3      ma4      ma5
      -0.3007  0.0567  -0.2058  -0.3119  -0.1090
s.e.   0.0511  0.0528   0.0540   0.0519   0.0513

sigma^2 estimated as 618.8:  log likelihood=-1698.48
AIC=3408.96   AICc=3409.19   BIC=3432.39

The best model is ARIMA(1,1,4) with the AIC value of 3408.59.

model_fakir_arima <- arima(fakir_ts,order=c(1,1,4))
checkresiduals(model_fakir_arima)
fakir_train$res1 <- model_fakir_arima$residuals
corr <- cor(fakir_train[!is.na(fakir_train$price),c(-1,-2)])
ggcorrplot(corr,hc.order = TRUE, type = "lower")
    Ljung-Box test

data:  Residuals from ARIMA(1,1,4)
Q* = 1.7263, df = 5, p-value = 0.8856

Model df: 5.   Total lags used: 10

Residuals seem stationary with a constant mean of 0 and nearly constant variance. Histogram of the residuals look as a normal distribution. Category_sold attribute is correlated with the residuals.

ggplot(fakir_train,aes(x=res1,y=category_sold)) +  geom_point() + geom_smooth()

ggplot(fakir_train[2:.N],aes(x=res1)) +  geom_point(aes(y=diff(fakir_train$category_sold,1))) +
geom_smooth(aes(y=diff(fakir_train$category_sold,1)))

cor(fakir_train[2:.N]$res1,diff(fakir_train$category_sold,1))
fakir_train$diff_sold <- c(NA,diff(fakir_train$category_sold,1))
fakir$diff_sold <- c(NA,diff(fakir$category_sold,1))

0.811754849467892

arima(fakir_ts,order=c(1,1,4),xreg=fakir_train$diff_sold)
Call:
arima(x = fakir_ts, order = c(1, 1, 4), xreg = fakir_train$diff_sold)

Coefficients:
         ar1      ma1      ma2      ma3      ma4  fakir_train$diff_sold
      0.5127  -0.4745  -0.0735  -0.1067  -0.2695                 0.1428
s.e.  0.0862   0.0863   0.0565   0.0552   0.0598                 0.0071

sigma^2 estimated as 324.8:  log likelihood = -1578.32,  aic = 3170.64

New regressor give better AIC value which is 3170.64 and better than the previous model.

model_fakir_arimax <- arima(fakir_ts,order=c(1,1,4),xreg=fakir_train$diff_sold)
checkresiduals(model_fakir_arimax)
fakir_train$res2 <- model_fakir_arimax$residuals
corr <- cor(fakir_train[!is.na(fakir_train$price)& !is.na(fakir_train$res2),c(-1,-2)])
ggcorrplot(corr,hc.order = TRUE, type = "lower")
    Ljung-Box test

data:  Residuals from ARIMA(1,1,4)
Q* = 3.4188, df = 4, p-value = 0.4903

Model df: 6.   Total lags used: 10

Residuals are more stationary now. We should apply the new model to the test period.

Linear Models

Firstly, the days of week and trend columns will be added to the data. Next, the first model will be built with trend and weekly seasonality. Then, we’ll look for possible regressors and add them to the model. Finally, we will find the best model among them.

prod6[, `:=`(trend=1:.N, 
            days=wday(event_date, label=T))]

model6_1 <- lm(sold_count~trend+days, data=prod6)
summary(model6_1)
Call:
lm(formula = sold_count ~ trend + days, data = prod6)

Residuals:
    Min      1Q  Median      3Q     Max 
-44.570 -17.669  -8.300   3.405 269.865 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  50.93116    4.05183  12.570  < 2e-16 ***
trend        -0.05948    0.01903  -3.125  0.00192 ** 
days.L       -2.61311    5.36733  -0.487  0.62666    
days.Q      -13.07621    5.35764  -2.441  0.01514 *  
days.C       -0.12958    5.35285  -0.024  0.98070    
days^4        4.99462    5.34960   0.934  0.35111    
days^5        0.57142    5.33822   0.107  0.91481    
days^6        0.56163    5.32936   0.105  0.91613    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 38.78 on 360 degrees of freedom
Multiple R-squared:  0.04466,   Adjusted R-squared:  0.02609 
F-statistic: 2.404 on 7 and 360 DF,  p-value: 0.02045
checkresiduals(model6_1)
    Breusch-Godfrey test for serial correlation of order up to 11

data:  Residuals
LM test = 221.5, df = 11, p-value < 2.2e-16

prod6[, residuals1:=model6_1$residuals]
corr <- round(cor(Filter(is.numeric, prod6)), 1)
ggcorrplot(corr, hc.order = TRUE,type = "lower")

As usual, the maximum correlation with residuals1 is obtained by basket_count. Therefore, its lagged version will be added to the model. To find the lag value, we’ll check the CCF function.

ccf(prod6$residuals1, prod6$basket_count)

model6_2 <- lm(sold_count~trend+days+shift(basket_count,1), data=prod6)
summary(model6_2)
Call:
lm(formula = sold_count ~ trend + days + shift(basket_count, 
    1), data = prod6)

Residuals:
     Min       1Q   Median       3Q      Max 
-109.873   -7.746   -1.099    6.125  138.879 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)             3.426447   2.713588   1.263  0.20752    
trend                  -0.030138   0.010329  -2.918  0.00375 ** 
days.L                 -1.460099   2.891712  -0.505  0.61392    
days.Q                 -5.290561   2.894116  -1.828  0.06838 .  
days.C                  2.860463   2.885882   0.991  0.32226    
days^4                  3.629424   2.886902   1.257  0.20950    
days^5                 -3.167226   2.880009  -1.100  0.27219    
days^6                  3.501694   2.869863   1.220  0.22321    
shift(basket_count, 1)  0.285417   0.009591  29.760  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 20.86 on 358 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.725, Adjusted R-squared:  0.7189 
F-statistic:   118 on 8 and 358 DF,  p-value: < 2.2e-16
checkresiduals(model6_2)
    Breusch-Godfrey test for serial correlation of order up to 12

data:  Residuals
LM test = 43.261, df = 12, p-value = 2.04e-05

prod6[, residuals2:=c(NA, model6_2$residuals)]
corr <- round(cor(Filter(is.numeric, prod6[!is.na(prod6$residuals2)])), 1)
ggcorrplot(corr, hc.order = TRUE,type = "lower")

The maximum correlation with residuals2 is obtained by category_sold. Therefore, its lagged version will be added to the model. To find the lag value, we’ll check the CCF function.

ccf(prod6$residuals2, prod6$category_sold, na.action=na.pass)

model6_3 <- lm(sold_count~trend+days+shift(basket_count,1)+shift(category_sold,1), data=prod6)
summary(model6_3)
Call:
lm(formula = sold_count ~ trend + days + shift(basket_count, 
    1) + shift(category_sold, 1), data = prod6)

Residuals:
     Min       1Q   Median       3Q      Max 
-105.922   -7.661   -0.903    5.732  143.119 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)              3.38581    2.70567   1.251  0.21162    
trend                   -0.03524    0.01070  -3.295  0.00108 ** 
days.L                  -2.17123    2.91111  -0.746  0.45625    
days.Q                  -4.93560    2.89255  -1.706  0.08882 .  
days.C                   3.28128    2.88719   1.136  0.25651    
days^4                   3.73207    2.87896   1.296  0.19570    
days^5                  -2.98193    2.87341  -1.038  0.30008    
days^6                   3.42658    2.86170   1.197  0.23195    
shift(basket_count, 1)   0.26721    0.01405  19.013  < 2e-16 ***
shift(category_sold, 1)  0.02147    0.01214   1.768  0.07799 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 20.8 on 357 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.7274,    Adjusted R-squared:  0.7206 
F-statistic: 105.9 on 9 and 357 DF,  p-value: < 2.2e-16
checkresiduals(model6_3)
    Breusch-Godfrey test for serial correlation of order up to 13

data:  Residuals
LM test = 47.16, df = 13, p-value = 9.08e-06

The adjusted Rsqr value is slightly increased to 0.7206. Because the increase is too low to add a new attribute to the model, we’ll drop it. Next, let’s check if the residuals2 is related with the lagged version of the input variable.

ccf(prod6$residuals2, prod6$sold_count, na.action=na.pass)

model6_4 <- lm(sold_count~trend+days+shift(basket_count,1)+shift(sold_count,1), data=prod6)
summary(model6_4)
Call:
lm(formula = sold_count ~ trend + days + shift(basket_count, 
    1) + shift(sold_count, 1), data = prod6)

Residuals:
     Min       1Q   Median       3Q      Max 
-107.096   -7.771   -1.095    5.797  142.149 

Coefficients:
                       Estimate Std. Error t value Pr(>|t|)    
(Intercept)             3.30374    2.71442   1.217  0.22437    
trend                  -0.02831    0.01045  -2.711  0.00704 ** 
days.L                 -2.02811    2.93201  -0.692  0.48957    
days.Q                 -5.12895    2.89617  -1.771  0.07742 .  
days.C                  3.09388    2.89164   1.070  0.28537    
days^4                  3.73696    2.88707   1.294  0.19637    
days^5                 -2.99147    2.88271  -1.038  0.30010    
days^6                  3.36127    2.87112   1.171  0.24249    
shift(basket_count, 1)  0.26603    0.01935  13.749  < 2e-16 ***
shift(sold_count, 1)    0.06604    0.05724   1.154  0.24941    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 20.85 on 357 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.7261,    Adjusted R-squared:  0.7192 
F-statistic: 105.1 on 9 and 357 DF,  p-value: < 2.2e-16

The adjusted Rsqr value is slightly decreased to 0.7192. Because it decreases, we’ll drop it. Therefore, the last model is the second one which is model6_2.

prod6[, preds:=c(NA,model6_2$fitted.values)]
ggplot(prod6, aes(x=preds, y=sold_count)) + geom_point() + geom_abline(intercept=0, slope=1, color="red")

Forecasting

forecast_data_fakir <- data.table(event_date=fakir[event_date>=test_start&event_date<=test_end,]$event_date,
                            sold_count=fakir[event_date>=test_start&event_date<=test_end,]$sold_count)
fakir[,`:=`(trend=1:.N,
           days=wday(event_date, label=T))]
arima_fc <- numeric(0)
arimax_fc <- numeric(0)
reg_fc <- numeric(0)
for(i in 1:length(test_dates)){
  
  train_dt <- fakir[event_date<test_dates[i],]
  model_arima <- arima(train_dt$sold_count,order=c(1,1,4))
  model_arimax <- Arima(train_dt$sold_count,order=c(1,1,4),xreg=train_dt$diff_sold)
  model_reg <- lm(sold_count~trend+days+shift(basket_count,1)+shift(sold_count,1), data=train_dt)
  newreg <- forecast(auto.arima(train_dt$diff_sold),h=1)$mean[1]
  arima_temp <- forecast(model_arima)
  arimax_temp <- forecast(model_arimax,xreg=newreg)
  reg_temp <- predict(model_reg,newdata=fakir[event_date %in% c(test_dates[i],test_dates[i+1])])
  arima_fc <- c(arima_fc,arima_temp$mean[1])
  arimax_fc <- c(arimax_fc,arimax_temp$mean[1])
  reg_fc <- c(reg_fc,reg_temp)
  
}
reg_fc[is.na(reg_fc)] <- mean(reg_fc[!is.na(reg_fc)])
forecast_data_fakir <- forecast_data_fakir[,`:=`(arima_p=arima_fc,
                                               arimax_p=arimax_fc,
                                               reg_p=reg_fc[c(1,seq(2,28,2))])]
accu(forecast_data_fakir$sold_count,forecast_data_fakir$arima_p)
accu(forecast_data_fakir$sold_count,forecast_data_fakir$arimax_p)
accu(forecast_data_fakir[1:(.N-1)]$sold_count,forecast_data_fakir[2:.N]$reg_p)
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
15 20.2 7.598872 0.3761818 -0.1087871 0.3152798 7.225078 5.771497 0.2857177 0.2857177
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
15 20.2 7.598872 0.3761818 0.544272 1.255578 33.04825 19.53245 0.9669529 0.9669529
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
14 20.71429 7.609999 0.3673793 0.2171758 0.264644 6.361172 5.228302 0.2524008 0.2524008
result_data_fakir <- data.table(event_date=fakir[event_date>=result_start&event_date<=result_end,]$event_date,
                               sold_count=fakir[event_date>=result_start&event_date<=result_end,]$sold_count)
final_fc <- numeric(0)
for(i in 1:length(result_dates)){
  
  train_dt <- fakir[event_date<result_dates[i],]
  model_final <- lm(sold_count~trend+days+shift(basket_count, 1)+shift(sold_count, 1), data=train_dt)
  final_temp <- predict(model_final,newdata=fakir[event_date %in% c(result_dates[i],result_dates[i+1])])
  final_fc <- c(final_fc,final_temp)
  
}
final_fc[is.na(final_fc)] <- mean(final_fc[!is.na(final_fc)])
result_data_fakir <- result_data_fakir[,`:=`(final_p=final_fc[c(1,seq(2,28,2))])]
accu(result_data_fakir[1:(.N-1)]$sold_count,result_data_fakir[2:.N]$final_p)
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
14 11.85714 5.216205 0.4399209 0.4678085 0.6917984 6.973269 6.248761 0.5270039 0.5270039
result_data_fakir <- data.table(event_date=fakir[event_date>=result_start&event_date<=result_end,]$event_date,
                               sold_count=fakir[event_date>=result_start&event_date<=result_end,]$sold_count)
final_fc <- numeric(0)
for(i in 1:length(result_dates)){
  
  train_dt <- fakir[event_date<result_dates[i],]
  model_final <- arima(train_dt$sold_count,order=c(1,1,4))
  final_temp <- forecast(model_final)
  final_fc <- c(final_fc,final_temp$mean[1])
  
}
result_data_fakir <- result_data_fakir[,`:=`(final_p=final_fc)]
accu(result_data_fakir$sold_count,result_data_fakir$final_p)
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
15 11.6 5.124172 0.441739 -0.2790102 0.8240115 6.241798 5.252488 0.4528007 0.4528007
data.frame(event_date=result_data_fakir$event_date,
           sold_count=result_data_fakir$sold_count,
           predictions=result_data_fakir$final_p)
event_date sold_count predictions
2021-06-12 17 6.3102706
2021-06-13 19 13.5605705
2021-06-14 15 13.3960734
2021-06-15 15 16.7678286
2021-06-16 18 7.0958760
2021-06-17 8 9.1104323
2021-06-18 16 0.2006896
2021-06-19 7 3.4685854
2021-06-20 14 2.3142025
2021-06-21 10 5.8268957
2021-06-22 6 13.1453894
2021-06-23 12 4.7840316
2021-06-24 7 3.8448037
2021-06-25 2 -2.0850536
2021-06-26 8 -3.0865363

Xiaomi Bluetooth Kulaklık

ggplot(prod5, aes(x=event_date, y=sold_count)) + geom_line() + geom_smooth(se=F)

acf(prod5$sold_count)

From the ACF graph, there is a trend. Also, there is a little increase at lag 7, which may lead to weekly seasonality.

Weekly Decomposition

The variance stays same over time, so additive decomposition will be used again.

prod5_weekly_dec <- decompose(ts(prod5$sold_count, freq=7), type="additive")
plot(prod5_weekly_dec)

plot(prod5_weekly_dec$random)

u_test <- ur.kpss(prod5_weekly_dec$random)
summary(u_test)
####################### 
# KPSS Unit Root Test # 
####################### 

Test is of type: mu with 5 lags. 

Value of test-statistic is: 0.0149 

Critical value for a significance level of: 
                10pct  5pct 2.5pct  1pct
critical values 0.347 0.463  0.574 0.739

From the test result above, we can say that the data is stationary.

acf(prod5_weekly_dec$random, na.action=na.pass)
pacf(prod5_weekly_dec$random, na.action=na.pass)

From the ACF, the high correlation at lag 3 is seen. Also there is a sinus wave on the ACF graph. The high correlation at lag 3 also can be seen on the PACF graph.

Monthly Decomposition

prod5_monthly <- prod5[, month:=yearmonth(event_date)]
prod5_monthly[, .(sold_count=sum(sold_count)), by=month]
prod5_monthly[, trend:=1:.N]

prod5_reg <- lm(sold_count~trend+as.factor(month), data=prod5_monthly)
prod5_monthly[, regOutput:=predict(prod5_reg, prod5_monthly)]

prod5_monthly[, random:=sold_count-regOutput]

ggplot(prod5_monthly, aes(x=event_date, y=random)) + geom_line()
month sold_count
2020 May 4515
2020 Jun 12746
2020 Jul 19116
2020 Aug 12981
2020 Sep 7565
2020 Oct 6917
2020 Nov 10553
2020 Dec 11358
2021 Jan 12948
2021 Feb 10668
2021 Mar 8477
2021 Apr 14543
2021 May 12218

u_test <- ur.kpss(prod5_monthly$random)
summary(u_test)
####################### 
# KPSS Unit Root Test # 
####################### 

Test is of type: mu with 5 lags. 

Value of test-statistic is: 0.0178 

Critical value for a significance level of: 
                10pct  5pct 2.5pct  1pct
critical values 0.347 0.463  0.574 0.739

We cannot reject the null hyphothesis which means that the data is stationary.

acf(prod5_monthly$random)
pacf(prod5_monthly$random)

At lag 1, there is correlation. Differencing may be required.

Building an ARIMA Model

xaomi <- data[product_content_id=="6676673",][order(event_date)]
xaomi_train <- xaomi[event_date<test_start,]
xaomi_ts <- ts(xaomi_train$sold_count,freq=7)
ggplot(xaomi_train,aes(x=event_date,y=sold_count)) + geom_line() +
  labs(title= "Graph of Xaomi Bluetooth Earphone", x= "Date", y="Quantity")

Xaomi sold_count series is not stationary due to the non constant mean and variance.

ggplot(xaomi_train[2:.N,],aes(x=event_date)) + geom_line(aes(y=diff(xaomi_train$sold_count,1))) + 
  labs(title= "Graph of Differenced Series", x= "Date", y="Quantity")

acf(diff(xaomi_train$sold_count,1))
pacf(diff(xaomi_train$sold_count,1))

At the decomposition step, it was said that the data has weekly seasonality. We could build a SARIMA model. ACF graphs has spikes at lag 1 and 3. Also, PACF graph has spikes at lag 1,3,4 and 9. We could try these parameters on a SARIMA model.

arima(xaomi_ts,order=c(1,1,1),seasonal=c(2,0,0))
Call:
arima(x = xaomi_ts, order = c(1, 1, 1), seasonal = c(2, 0, 0))

Coefficients:
         ar1      ma1    sar1     sar2
      0.6823  -0.9777  0.0661  -0.0317
s.e.  0.0452   0.0165  0.0536   0.0534

sigma^2 estimated as 23482:  log likelihood = -2368.26,  aic = 4746.52
arima(xaomi_ts,order=c(3,1,1),seasonal=c(2,0,0))
Call:
arima(x = xaomi_ts, order = c(3, 1, 1), seasonal = c(2, 0, 0))

Coefficients:
         ar1     ar2      ar3      ma1    sar1     sar2
      0.6384  0.1506  -0.1314  -0.9738  0.0834  -0.0415
s.e.  0.0538  0.0615   0.0535   0.0179  0.0541   0.0538

sigma^2 estimated as 22981:  log likelihood = -2364.31,  aic = 4742.62
arima(xaomi_ts,order=c(3,1,3),seasonal=c(2,0,0))
Call:
arima(x = xaomi_ts, order = c(3, 1, 3), seasonal = c(2, 0, 0))

Coefficients:
          ar1      ar2     ar3      ma1      ma2      ma3    sar1     sar2
      -0.2444  -0.0401  0.4721  -0.0861  -0.1078  -0.7521  0.0788  -0.0335
s.e.   0.1829   0.1376  0.1151   0.1693   0.1505   0.1179  0.0596   0.0563

sigma^2 estimated as 22719:  log likelihood = -2362.28,  aic = 4742.57
arima(xaomi_ts,order=c(1,1,3),seasonal=c(2,0,0))
Call:
arima(x = xaomi_ts, order = c(1, 1, 3), seasonal = c(2, 0, 0))

Coefficients:
         ar1      ma1     ma2      ma3    sar1     sar2
      0.5746  -0.9060  0.1249  -0.1867  0.0801  -0.0388
s.e.  0.1001   0.1081  0.0759   0.0688  0.0540   0.0534

sigma^2 estimated as 22970:  log likelihood = -2364.22,  aic = 4742.45
auto.arima(xaomi_ts)
Series: xaomi_ts 
ARIMA(3,0,0)(0,0,1)[7] with non-zero mean 

Coefficients:
         ar1     ar2      ar3    sma1      mean
      0.6566  0.1582  -0.1179  0.0979  393.0930
s.e.  0.0518  0.0617   0.0526  0.0549   28.3416

sigma^2 estimated as 23169:  log likelihood=-2369.37
AIC=4750.74   AICc=4750.97   BIC=4774.18

The best model is SARIMA(1,1,3)(2,0,0) with the AIC value of 4742.45.

model_xaomi_sarima <- arima(xaomi_ts,order=c(1,1,3),seasonal=c(2,0,0))
checkresiduals(model_xaomi_sarima)
xaomi_train$res1 <- model_xaomi_sarima$residuals
corr <- cor(xaomi_train[!is.na(xaomi_train$price),c(-1,-2)])
ggcorrplot(corr,hc.order = TRUE, type = "lower")
    Ljung-Box test

data:  Residuals from ARIMA(1,1,3)(2,0,0)[7]
Q* = 7.85, df = 8, p-value = 0.4483

Model df: 6.   Total lags used: 14

Residuals have a constant mean at 0. They are not correlated and they have nearly constant variance. Although the distribution seems normal, histogram is right skewed. Residuals are correlated with category_sold.

ggplot(xaomi_train,aes(x=res1,y=category_sold)) +  geom_point() + geom_smooth()

cor(xaomi_train$res1,xaomi_train$category_sold)

0.556863493199146

The correlation seems to be linear with 0.544, there is no need to transform the data. We could add it as a regressor.

model_xaomi_sarimax <- arima(xaomi_ts,order=c(1,1,3),seasonal=c(2,0,0),xreg=xaomi_train$category_sold)
checkresiduals(model_xaomi_sarimax)
xaomi_train$res2 <- model_xaomi_sarimax$residuals
corr <- cor(xaomi_train[!is.na(xaomi_train$price),c(-1,-2)])
ggcorrplot(corr,hc.order = TRUE, type = "lower")
    Ljung-Box test

data:  Residuals from ARIMA(1,1,3)(2,0,0)[7]
Q* = 9.7366, df = 7, p-value = 0.204

Model df: 7.   Total lags used: 14

Residuals seems more stationart and also right skew of the histogram is gone. We could use this SARIMAX model in the forecasting.

Linear Models

As we did above, firstly, the days of week and trend columns will be added to the data. Next, the first model will be built with trend and weekly seasonality. Then, we’ll look for possible regressors and add them to the model. Finally, we will find the best model among them.

prod5[, `:=`(trend=1:.N, 
            days=wday(event_date, label=T))]

model5_1 <- lm(sold_count~trend+days, data=prod5)
summary(model5_1)
Call:
lm(formula = sold_count ~ trend + days, data = prod5)

Residuals:
    Min      1Q  Median      3Q     Max 
-368.91 -153.74  -48.38  111.42 1422.51 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 428.2984    22.8005  18.785   <2e-16 ***
trend        -0.1927     0.1071  -1.800   0.0727 .  
days.L       21.9522    30.2032   0.727   0.4678    
days.Q      -71.9353    30.1486  -2.386   0.0175 *  
days.C        0.3802    30.1217   0.013   0.9899    
days^4        0.9028    30.1034   0.030   0.9761    
days^5       13.5253    30.0394   0.450   0.6528    
days^6       12.3026    29.9895   0.410   0.6819    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 218.2 on 360 degrees of freedom
Multiple R-squared:  0.02657,   Adjusted R-squared:  0.007643 
F-statistic: 1.404 on 7 and 360 DF,  p-value: 0.2025

The p value of the model is high. The model is meaningless. Trend and days are not significant for predicting sold_count of prod5.

corr <- round(cor(Filter(is.numeric, prod5)), 1)
ggcorrplot(corr, hc.order = TRUE,type = "lower")

The maximum correlation with sold_count is obtained by basket_count. Therefore, its lagged version will be added to the model. To find the lag value, we’ll check the CCF function.

ccf(prod5$sold_count, prod5$basket_count)

model5_2 <- lm(sold_count~shift(basket_count,1), data=prod5)
summary(model5_2)
Call:
lm(formula = sold_count ~ shift(basket_count, 1), data = prod5)

Residuals:
    Min      1Q  Median      3Q     Max 
-599.36  -95.60  -20.52   76.95 1230.14 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)            109.51362   19.65421   5.572  4.9e-08 ***
shift(basket_count, 1)   0.18288    0.01137  16.090  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 167.9 on 365 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.4149,    Adjusted R-squared:  0.4133 
F-statistic: 258.9 on 1 and 365 DF,  p-value: < 2.2e-16
checkresiduals(model5_2)
    Breusch-Godfrey test for serial correlation of order up to 10

data:  Residuals
LM test = 58.421, df = 10, p-value = 7.203e-09

prod5[, residuals2:=c(NA, model5_2$residuals)]
corr <- round(cor(Filter(is.numeric, prod5[!is.na(prod5$residuals2)])), 1)
ggcorrplot(corr, hc.order = TRUE,type = "lower")

Price may improve the model. Let’s try to add it to the model.

ccf(prod5$residuals2, prod5$price, na.action=na.pass)

model5_3 <- lm(sold_count~shift(basket_count,1)+shift(price,1), data=prod5)
summary(model5_3)
Call:
lm(formula = sold_count ~ shift(basket_count, 1) + shift(price, 
    1), data = prod5)

Residuals:
    Min      1Q  Median      3Q     Max 
-525.98  -96.86  -18.84   66.54 1206.46 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)            588.06073  114.54065   5.134 4.63e-07 ***
shift(basket_count, 1)   0.15925    0.01243  12.809  < 2e-16 ***
shift(price, 1)         -3.22318    0.76054  -4.238 2.86e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 164.2 on 364 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.4425,    Adjusted R-squared:  0.4394 
F-statistic: 144.4 on 2 and 364 DF,  p-value: < 2.2e-16
checkresiduals(model5_3)
    Breusch-Godfrey test for serial correlation of order up to 10

data:  Residuals
LM test = 46.235, df = 10, p-value = 1.3e-06

In the next step, let’s check if the residuals3 is related with the lagged version of the input variable.

prod5[, residuals3:=c(NA, model5_3$residuals)]
ccf(prod5$residuals3, prod5$sold_count, na.action=na.pass)

model5_4 <- lm(sold_count~shift(basket_count,1)+shift(price,1)+shift(sold_count,1), data=prod5)
summary(model5_4)
Call:
lm(formula = sold_count ~ shift(basket_count, 1) + shift(price, 
    1) + shift(sold_count, 1), data = prod5)

Residuals:
    Min      1Q  Median      3Q     Max 
-704.47  -75.12  -24.16   50.26 1121.56 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)            330.16838  111.94924   2.949  0.00339 ** 
shift(basket_count, 1)   0.03013    0.02064   1.460  0.14513    
shift(price, 1)         -1.53664    0.74230  -2.070  0.03915 *  
shift(sold_count, 1)     0.57580    0.07620   7.556 3.42e-13 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 152.8 on 363 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.5182,    Adjusted R-squared:  0.5143 
F-statistic: 130.2 on 3 and 363 DF,  p-value: < 2.2e-16
checkresiduals(model5_4)
    Breusch-Godfrey test for serial correlation of order up to 10

data:  Residuals
LM test = 16.264, df = 10, p-value = 0.09233

The final model is model5_4 with an adjusted Rsqr value of 0.52.

prod5[, preds:=c(NA,model5_4$fitted.values)]
ggplot(prod5, aes(x=preds, y=sold_count)) + geom_point() + geom_abline(intercept=0, slope=1, color="red")

Forecasting

forecast_data_xaomi <- data.table(event_date=xaomi[event_date>=test_start&event_date<=test_end,]$event_date,
                            sold_count=xaomi[event_date>=test_start&event_date<=test_end,]$sold_count)
xaomi[,`:=`(trend=1:.N,
           days=wday(event_date, label=T))]
sarima_fc <- numeric(0)
sarimax_fc <- numeric(0)
reg_fc <- numeric(0)
for(i in 1:length(test_dates)){
  
  train_dt <- xaomi[event_date<test_dates[i],]
  model_sarima <- Arima(train_dt$sold_count,order=c(1,1,3),seasonal=c(2,0,0))
  model_sarimax <- Arima(train_dt$sold_count,order=c(1,1,3),seasonal=c(2,0,0),xreg=train_dt$category_sold)
  model_reg <- lm(sold_count~shift(basket_count,1)+shift(price,1)+shift(sold_count,1), data=train_dt)
  newreg <- forecast(auto.arima(train_dt$category_sold),h=1)$mean[1]
  sarima_temp <- forecast(model_sarima)
  sarimax_temp <- forecast(model_sarimax,xreg=newreg)
  reg_temp <- predict(model_reg,newdata=xaomi[event_date %in% c(test_dates[i],test_dates[i+1])])
  sarima_fc <- c(sarima_fc,sarima_temp$mean[1])
  sarimax_fc <- c(sarimax_fc,sarimax_temp$mean[1])
  reg_fc <- c(reg_fc,reg_temp)
  
}
reg_fc[is.na(reg_fc)] <- mean(reg_fc[!is.na(reg_fc)])
forecast_data_xaomi <- forecast_data_xaomi[,`:=`(sarima_p=sarima_fc,
                                               sarimax_p=sarimax_fc,
                                               reg_p=reg_fc[c(1,seq(2,28,2))])]
accu(forecast_data_xaomi$sold_count,forecast_data_xaomi$sarima_p)
accu(forecast_data_xaomi$sold_count,forecast_data_xaomi$sarimax_p)
accu(forecast_data_xaomi[1:(.N-1)]$sold_count,forecast_data_xaomi[2:.N]$reg_p)
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
15 438.6667 100.6909 0.2295386 0.026259 0.1372231 72.35346 60.25166 0.1373518 0.1373518
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
15 438.6667 100.6909 0.2295386 -0.01782096 0.270721 153.6741 117.3178 0.2674419 0.2674419
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
14 440.2857 104.2891 0.2368669 -0.02021158 0.06922325 30.03414 25.85974 0.05873399 0.05873399
result_data_xaomi <- data.table(event_date=xaomi[event_date>=result_start&event_date<=result_end,]$event_date,
                               sold_count=xaomi[event_date>=result_start&event_date<=result_end,]$sold_count)
final_fc <- numeric(0)
for(i in 1:length(result_dates)){
  
  train_dt <- xaomi[event_date<result_dates[i],]
  model_final <- lm(sold_count~trend+days+shift(basket_count, 1)+shift(sold_count, 1), data=train_dt)
  final_temp <- predict(model_final,newdata=xaomi[event_date %in% c(result_dates[i],result_dates[i+1])])
  final_fc <- c(final_fc,final_temp)
  
}
final_fc[is.na(final_fc)] <- mean(final_fc[!is.na(final_fc)])
result_data_xaomi <- result_data_xaomi[,`:=`(final_p=final_fc[c(1,seq(2,28,2))])]
accu(result_data_xaomi[1:(.N-1)]$sold_count,result_data_xaomi[2:.N]$final_p)
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
14 507 133.3722 0.2630616 0.09476509 0.08709704 60.34427 48.77248 0.09619818 0.09619818
data.frame(event_date=result_data_xaomi[1:(.N-1)]$event_date,
           sold_count=result_data_xaomi[1:(.N-1)]$sold_count,
           predictions=result_data_xaomi[2:.N]$final_p)
event_date sold_count predictions
2021-06-12 392 330.1629
2021-06-13 500 472.0971
2021-06-14 557 513.5997
2021-06-15 508 472.9138
2021-06-16 702 619.5400
2021-06-17 530 441.5001
2021-06-18 620 514.9317
2021-06-19 614 510.7500
2021-06-20 658 590.8018
2021-06-21 455 445.9249
2021-06-22 597 549.1724
2021-06-23 425 430.0860
2021-06-24 274 270.1768
2021-06-25 266 263.7002

Trendyolmilla Tayt

ggplot(prod4, aes(x=event_date, y=sold_count)) + geom_line() + geom_smooth(se=F)

acf(prod4$sold_count) 

There is a little increase in the autocorrelation value at lag 16.

Weekly Decomposition

The variance does not increase over time, so additive decomposition will be used to decompose the data at weekly level.

prod4_weekly_dec <- decompose(ts(prod4$sold_count, freq=7), type="multiplicative")
plot(prod4_weekly_dec)

plot(prod4_weekly_dec$random)

u_test <- ur.kpss(prod4_weekly_dec$random)
summary(u_test)
####################### 
# KPSS Unit Root Test # 
####################### 

Test is of type: mu with 5 lags. 

Value of test-statistic is: 0.1182 

Critical value for a significance level of: 
                10pct  5pct 2.5pct  1pct
critical values 0.347 0.463  0.574 0.739

The weekly decomposed data is stationary.

acf(prod4_weekly_dec$random, na.action=na.pass)
pacf(prod4_weekly_dec$random, na.action=na.pass)

ACF graph is like sinus shaped. First two lag have high correlation values.

Monthly Decomposition

prod4_monthly <- prod4[, month:=yearmonth(event_date)]
prod4_monthly[, .(sold_count=sum(sold_count)), by=month]
prod4_monthly[, trend:=1:.N]

prod4_reg <- lm(sold_count~trend+as.factor(month), data=prod4_monthly)
prod4_monthly[, regOutput:=predict(prod4_reg, prod4_monthly)]

prod4_monthly[, random:=sold_count-regOutput]

ggplot(prod4_monthly, aes(x=event_date, y=random)) + geom_line()
month sold_count
2020 May 4928
2020 Jun 17698
2020 Jul 12295
2020 Aug 14991
2020 Sep 22929
2020 Oct 35979
2020 Nov 70576
2020 Dec 38455
2021 Jan 23647
2021 Feb 20564
2021 Mar 29121
2021 Apr 15669
2021 May 11086

Now, let’s check whether the detrended and deseasonalized data is stationary.

u_test <- ur.kpss(prod4_monthly$random)
summary(u_test)
####################### 
# KPSS Unit Root Test # 
####################### 

Test is of type: mu with 5 lags. 

Value of test-statistic is: 0.0087 

Critical value for a significance level of: 
                10pct  5pct 2.5pct  1pct
critical values 0.347 0.463  0.574 0.739

From the test above, the data is stationary.

acf(prod4_monthly$random, na.action=na.pass)
pacf(prod4_monthly$random, na.action=na.pass)

The data is highly correlated with its first lag. Also, the ACF graph is like sinus wave.

Building an ARIMA Model

tayt <- data[product_content_id=="31515569",][order(event_date)]
tayt_train <- tayt[event_date<test_start,]
tayt_ts <- ts(tayt_train$sold_count)
ggplot(tayt_train,aes(x=event_date,y=sold_count)) + geom_line() +
  labs(title= "Graph of Trendyolmilla Tayt", x= "Date", y="Quantity")

ggplot(tayt_train[2:.N,],aes(x=event_date)) + geom_line(aes(y=diff(tayt_train$sold_count,1))) + 
  labs(title= "Graph of Differenced Series", x= "Date", y="Quantity")

acf(diff(tayt_train$sold_count,1))
pacf(diff(tayt_train$sold_count,1))

After the differencing, there is spikes at ACF graph at lag 4 and at PACF graph lag 4. An ARIMA(4,1,4) model could be constructed.

arima(tayt_ts,order=c(4,1,4))
Call:
arima(x = tayt_ts, order = c(4, 1, 4))

Coefficients:
         ar1      ar2     ar3      ar4      ma1     ma2      ma3      ma4
      0.4575  -0.1596  0.0707  -0.1198  -0.6399  0.1546  -0.1995  -0.2459
s.e.  0.2703   0.3335  0.3074   0.1587   0.2717  0.3819   0.3527   0.2362

sigma^2 estimated as 466866:  log likelihood = -2917.11,  aic = 5852.23
auto.arima(tayt_ts)
Series: tayt_ts 
ARIMA(0,0,4) with non-zero mean 

Coefficients:
         ma1     ma2     ma3     ma4      mean
      0.8162  0.7191  0.5786  0.1516  863.9776
s.e.  0.0509  0.0583  0.0576  0.0520  116.0335

sigma^2 estimated as 474643:  log likelihood=-2925.24
AIC=5862.48   AICc=5862.71   BIC=5885.93

The best model is ARIMA(4,1,4) with lowest AIC 5852.23.

model_tayt_arima <- arima(tayt_ts,order=c(4,1,4))
checkresiduals(model_tayt_arima)
tayt_train$res1 <- model_tayt_arima$residuals
corr <- cor(tayt_train[!is.na(tayt_train$price),c(-1,-2)])
ggcorrplot(corr,hc.order = TRUE, type = "lower")
    Ljung-Box test

data:  Residuals from ARIMA(4,1,4)
Q* = 3.2231, df = 3, p-value = 0.3585

Model df: 8.   Total lags used: 11

Residuals have constant mean at 0 and constant variance. Also, distribution seems normal while there is no autocorrelation. Residuals are correlated with basket_count and category_sold. First we could add the basket_count variable.

ggplot(tayt_train,aes(x=res1,y=basket_count)) +  geom_point() + geom_smooth()

ggplot(tayt_train[2:.N],aes(x=res1)) +  geom_point(aes(y=diff(tayt_train$basket_count,1))) + 
geom_smooth(aes(y=diff(tayt_train$basket_count,1)))

A more linear curve obtained with differencing.

cor(tayt_train[2:.N]$res1,diff(tayt_train$basket_count,1))
tayt_train$diff_basket <- c(NA,diff(tayt_train$basket_count,1))
tayt$diff_basket <- c(NA,diff(tayt$basket_count,1))

0.443486892907098

arima(tayt_ts,order=c(4,1,4),xreg=tayt_train$diff_basket)
Call:
arima(x = tayt_ts, order = c(4, 1, 4), xreg = tayt_train$diff_basket)

Coefficients:
         ar1      ar2     ar3      ar4      ma1     ma2      ma3      ma4
      0.4499  -0.1336  0.0611  -0.1210  -0.6301  0.1220  -0.1764  -0.2464
s.e.  0.2448   0.3055  0.2849   0.1522   0.2454  0.3469   0.3234   0.2166
      tayt_train$diff_basket
                      0.0253
s.e.                  0.0110

sigma^2 estimated as 461211:  log likelihood = -2906.93,  aic = 5833.85
model_tayt_arimax <- arima(tayt_ts,order=c(4,1,4),xreg=tayt_train$diff_basket)
checkresiduals(model_tayt_arimax)
tayt_train$res2 <- model_tayt_arimax$residuals
corr <- cor(tayt_train[!is.na(tayt_train$price) & !is.na(tayt_train$res2),c(-1,-2)])
ggcorrplot(corr,hc.order = TRUE, type = "lower")
    Ljung-Box test

data:  Residuals from ARIMA(4,1,4)
Q* = 4.5512, df = 3, p-value = 0.2078

Model df: 9.   Total lags used: 12

ggplot(tayt_train,aes(x=res2,y=category_sold)) +  geom_point() + geom_smooth()

ggplot(tayt_train[2:.N],aes(x=res1)) +  geom_point(aes(y=diff(tayt_train$category_sold,1))) + geom_smooth(aes(y=diff(tayt_train$category_sold,1)))

cor(tayt_train[2:.N]$res2,diff(tayt_train$category_sold,1))
tayt_train$diff_sold <- c(NA,diff(tayt_train$category_sold,1))
tayt$diff_sold <- c(NA,diff(tayt$category_sold,1))

0.846604024659265

arima(tayt_ts,order=c(4,1,4),xreg=as.matrix(tayt_train$diff_basket,tayt_train$diff_sold))
Call:
arima(x = tayt_ts, order = c(4, 1, 4), xreg = as.matrix(tayt_train$diff_basket, 
    tayt_train$diff_sold))

Coefficients:
         ar1      ar2     ar3      ar4      ma1     ma2      ma3      ma4
      0.4499  -0.1336  0.0611  -0.1210  -0.6301  0.1220  -0.1764  -0.2464
s.e.  0.2448   0.3055  0.2849   0.1522   0.2454  0.3469   0.3234   0.2166
      as.matrix(tayt_train$diff_basket, tayt_train$diff_sold)
                                                       0.0253
s.e.                                                   0.0110

sigma^2 estimated as 461211:  log likelihood = -2906.93,  aic = 5833.85
model_tayt_arimax <- arima(tayt_ts,order=c(4,1,4),xreg=as.matrix(tayt_train$diff_basket,tayt_train$diff_sold))
checkresiduals(model_tayt_arimax)
tayt_train$res3 <- model_tayt_arimax$residuals
corr <- cor(tayt_train[!is.na(tayt_train$price) & !is.na(tayt_train$res2),c(-1,-2)])
ggcorrplot(corr,hc.order = TRUE, type = "lower")
    Ljung-Box test

data:  Residuals from ARIMA(4,1,4)
Q* = 4.5512, df = 3, p-value = 0.2078

Model df: 9.   Total lags used: 12

The final model has basket_count and category_sold as regressors. And residuals seem better both in stationarity and distribution manners.

Linear Models

Firstly, the days of week and trend columns will be added to the data. Next, the first model will be built with trend and weekly seasonality. Then, we’ll look for possible regressors and add them to the model. Finally, we will find the best model among them.

prod4[, `:=`(trend=1:.N, 
            days=wday(event_date, label=T))]

model4_1 <- lm(sold_count~trend+days, data=prod4)
summary(model4_1)
Call:
lm(formula = sold_count ~ trend + days, data = prod4)

Residuals:
    Min      1Q  Median      3Q     Max 
 -932.1  -520.5  -256.1    88.4 10018.6 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  847.33195  114.68451   7.388 1.05e-12 ***
trend          0.08136    0.53868   0.151  0.88004    
days.L       -42.67460  151.91906  -0.281  0.77895    
days.Q      -477.57912  151.64471  -3.149  0.00177 ** 
days.C        -0.77239  151.50931  -0.005  0.99594    
days^4        97.79199  151.41736   0.646  0.51879    
days^5       106.03586  151.09509   0.702  0.48327    
days^6        38.51719  150.84424   0.255  0.79860    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1098 on 360 degrees of freedom
Multiple R-squared:  0.02968,   Adjusted R-squared:  0.01081 
F-statistic: 1.573 on 7 and 360 DF,  p-value: 0.1421

Because the p value of the model is higher than 0.05, we can say that the model is meaningless. Let’s check for other regressors.

corr <- round(cor(Filter(is.numeric, prod4)), 1)
ggcorrplot(corr, hc.order = TRUE,type = "lower")

The maximum correlation with sold_count is obtained by basket_count. Therefore, its lagged version will be added to the model. To find the lag value, we’ll check the CCF function.

ccf(prod4$sold_count, prod4$basket_count)

model4_2 <- lm(sold_count~shift(basket_count,1), data=prod4)
summary(model4_2)
Call:
lm(formula = sold_count ~ shift(basket_count, 1), data = prod4)

Residuals:
    Min      1Q  Median      3Q     Max 
-4219.3  -188.7    11.5   101.8  5608.1 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)            -68.29416   56.49884  -1.209    0.228    
shift(basket_count, 1)   0.22272    0.01004  22.173   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 722.4 on 365 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.5739,    Adjusted R-squared:  0.5728 
F-statistic: 491.7 on 1 and 365 DF,  p-value: < 2.2e-16
checkresiduals(model4_2)
    Breusch-Godfrey test for serial correlation of order up to 10

data:  Residuals
LM test = 94.771, df = 10, p-value = 6.033e-16

prod4[, residuals2:=c(NA, model4_2$residuals)]
corr <- round(cor(Filter(is.numeric, prod4[!is.na(prod4$residuals2)])), 1)
ggcorrplot(corr, hc.order = TRUE,type = "lower")

The maximum correlation with residuals2 is obtained by category_sold. Therefore, its lagged version will be added to the model. To find the lag value, we’ll check the CCF function.

ccf(prod4$residuals2, prod4$category_sold, na.action=na.pass)

model4_3 <- lm(sold_count~shift(basket_count,1)+shift(category_sold,1), data=prod4)
summary(model4_3)
Call:
lm(formula = sold_count ~ shift(basket_count, 1) + shift(category_sold, 
    1), data = prod4)

Residuals:
    Min      1Q  Median      3Q     Max 
-3578.8  -192.2    -1.5    98.6  6455.5 

Coefficients:
                         Estimate Std. Error t value Pr(>|t|)    
(Intercept)             -73.75607   54.61245  -1.351    0.178    
shift(basket_count, 1)    0.14383    0.01807   7.960 2.21e-14 ***
shift(category_sold, 1)   0.18746    0.03621   5.177 3.75e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 698.1 on 364 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.6031,    Adjusted R-squared:  0.601 
F-statistic: 276.6 on 2 and 364 DF,  p-value: < 2.2e-16
checkresiduals(model4_3)
    Breusch-Godfrey test for serial correlation of order up to 10

data:  Residuals
LM test = 75.003, df = 10, p-value = 4.752e-12

Next, let’s check if the residuals3 is related with the lagged version of the input variable.

prod4[, residuals3:=c(NA, model4_3$fitted.values)]
ccf(prod4$residuals3, prod4$sold_count, na.action=na.pass)

model4_4 <- lm(sold_count~shift(basket_count,1)+shift(category_sold,1)+shift(sold_count,1), data=prod4)
summary(model4_4)
Call:
lm(formula = sold_count ~ shift(basket_count, 1) + shift(category_sold, 
    1) + shift(sold_count, 1), data = prod4)

Residuals:
    Min      1Q  Median      3Q     Max 
-4414.4  -146.7   -20.2    68.7  6529.8 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)              9.06362   56.49317   0.160    0.873    
shift(basket_count, 1)   0.12688    0.01804   7.032 1.02e-11 ***
shift(category_sold, 1) -0.04264    0.06302  -0.677    0.499    
shift(sold_count, 1)     0.46274    0.10495   4.409 1.37e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 681.1 on 363 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.6233,    Adjusted R-squared:  0.6202 
F-statistic: 200.2 on 3 and 363 DF,  p-value: < 2.2e-16
checkresiduals(model4_4)
    Breusch-Godfrey test for serial correlation of order up to 10

data:  Residuals
LM test = 61.802, df = 10, p-value = 1.65e-09

Let’s try to add 4 lagged sold_count to the model.

model4_5 <- lm(sold_count~shift(basket_count,1)+shift(category_sold,1)+shift(sold_count,1)+shift(sold_count,4), data=prod4)
summary(model4_5)
Call:
lm(formula = sold_count ~ shift(basket_count, 1) + shift(category_sold, 
    1) + shift(sold_count, 1) + shift(sold_count, 4), data = prod4)

Residuals:
    Min      1Q  Median      3Q     Max 
-2789.0  -181.1   -66.6    97.7  6129.7 

Coefficients:
                         Estimate Std. Error t value Pr(>|t|)    
(Intercept)             137.07419   57.19277   2.397   0.0171 *  
shift(basket_count, 1)    0.14827    0.01744   8.500 5.16e-16 ***
shift(category_sold, 1)  -0.11207    0.06077  -1.844   0.0660 .  
shift(sold_count, 1)      0.57244    0.10104   5.666 3.01e-08 ***
shift(sold_count, 4)     -0.21507    0.03268  -6.582 1.65e-10 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 646.7 on 359 degrees of freedom
  (4 observations deleted due to missingness)
Multiple R-squared:  0.6635,    Adjusted R-squared:  0.6597 
F-statistic: 176.9 on 4 and 359 DF,  p-value: < 2.2e-16
checkresiduals(model4_5)
    Breusch-Godfrey test for serial correlation of order up to 10

data:  Residuals
LM test = 29.734, df = 10, p-value = 0.0009469

So, the final model is model4_5.

prod4[, preds:=c(NA,NA,NA,NA,model4_5$fitted.values)]
ggplot(prod4, aes(x=preds, y=sold_count)) + geom_point() + geom_abline(intercept=0, slope=1, color="red") 

Forecasting

forecast_data_tayt <- data.table(event_date=tayt[event_date>=test_start&event_date<=test_end,]$event_date,
                            sold_count=tayt[event_date>=test_start&event_date<=test_end,]$sold_count)
tayt[,`:=`(trend=1:.N,
           days=wday(event_date, label=T))]
arima_fc <- numeric(0)
arimax_fc <- numeric(0)
reg_fc <- numeric(0)
for(i in 1:length(test_dates)){
  
  train_dt <- tayt[event_date<test_dates[i],]
  model_arima <- Arima(train_dt$sold_count,order=c(4,1,4))
  model_arimax <- Arima(tayt_ts,order=c(4,1,4),xreg=as.matrix(tayt_train$diff_basket,tayt_train$diff_sold))
  model_reg <- lm(sold_count~shift(basket_count,1)+shift(category_sold,1)+shift(sold_count,1)+shift(sold_count,1), data=train_dt)
  newreg <- forecast(auto.arima(train_dt$diff_basket),h=1)$mean[1]
  newreg2 <- forecast(auto.arima(train_dt$diff_sold),h=1)$mean[1]
  arima_temp <- forecast(model_arima)
  arimax_temp <- forecast(model_arimax,xreg=as.matrix(newreg,newreg2))
  reg_temp <- predict(model_reg,newdata=tayt[event_date %in% c(test_dates[i],test_dates[i+1])])
  arima_fc <- c(arima_fc,arima_temp$mean[1])
  arimax_fc <- c(arimax_fc,arimax_temp$mean[1])
  reg_fc <- c(reg_fc,reg_temp)
  
}
reg_fc[is.na(reg_fc)] <- mean(reg_fc[!is.na(reg_fc)])
forecast_data_tayt <- forecast_data_tayt[,`:=`(arima_p=arima_fc,
                                               arimax_p=arimax_fc,
                                               reg_p=reg_fc[c(1,seq(2,28,2))])]

accu(forecast_data_tayt$sold_count,forecast_data_tayt$arima_p)
accu(forecast_data_tayt$sold_count,forecast_data_tayt$arimax_p)
accu(forecast_data_tayt[1:(.N-1)]$sold_count,forecast_data_tayt[2:.N]$reg_p)
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
15 400.9333 228.3207 0.569473 -0.09140315 0.4765884 166.1847 128.8698 0.3214246 0.3214246
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
15 400.9333 228.3207 0.569473 0.1035967 0.5978989 227.2729 193.4942 0.4826095 0.4826095
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
14 415.3571 229.7382 0.55311 0.2867232 0.3270751 153.0789 123.5166 0.2973744 0.2973744
result_data_tayt <- data.table(event_date=tayt[event_date>=result_start&event_date<=result_end,]$event_date,
                               sold_count=tayt[event_date>=result_start&event_date<=result_end,]$sold_count)
final_fc <- numeric(0)
for(i in 1:length(result_dates)){
  
  train_dt <- tayt[event_date<result_dates[i],]
  model_final <- lm(sold_count~shift(basket_count,1)+shift(category_sold,1)+shift(sold_count,1)+shift(sold_count,1), data=train_dt)
  final_temp <- predict(model_final,newdata=tayt[event_date %in% c(result_dates[i],result_dates[i+1])])
  final_fc <- c(final_fc,final_temp)
  
}
final_fc[is.na(final_fc)] <- mean(final_fc[!is.na(final_fc)])
result_data_tayt <- result_data_tayt[,`:=`(final_p=final_fc[c(1,seq(2,28,2))])]
accu(result_data_tayt[1:(.N-1)]$sold_count,result_data_tayt[2:.N]$final_p)
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
14 252.5 71.95485 0.2849697 0.06086747 0.1971049 49.10554 46.75037 0.18515 0.18515
data.frame(event_date=result_data_tayt[1:(.N-1)]$event_date,
           sold_count=result_data_tayt[1:(.N-1)]$sold_count,
           predictions=result_data_tayt[2:.N]$final_p)
event_date sold_count predictions
2021-06-12 144 101.3098
2021-06-13 179 117.1697
2021-06-14 218 180.0986
2021-06-15 245 193.8707
2021-06-16 244 195.2337
2021-06-17 200 162.7112
2021-06-18 191 153.5335
2021-06-19 315 344.4629
2021-06-20 387 344.3593
2021-06-21 390 314.8777
2021-06-22 224 284.4709
2021-06-23 269 331.3431
2021-06-24 265 318.0161
2021-06-25 264 278.3763

Trendyolmilla Bikini 2

ggplot(prod3, aes(x=event_date, y=sold_count)) + geom_line() + geom_smooth(se=F)

acf(prod3$sold_count) 

The autocorrelation values decrease. There is a peak value at lag 9.

Weekly Decomposition

Because the variance does not regulary increase, additive decomposition will be used.

prod3_weekly_dec <- decompose(ts(prod3$sold_count, freq=7), type="additive")
plot(prod3_weekly_dec)

plot(prod3_weekly_dec$random)

u_test <- ur.kpss(prod3_weekly_dec$random)
summary(u_test)
####################### 
# KPSS Unit Root Test # 
####################### 

Test is of type: mu with 5 lags. 

Value of test-statistic is: 0.0067 

Critical value for a significance level of: 
                10pct  5pct 2.5pct  1pct
critical values 0.347 0.463  0.574 0.739

Even though the variance changes over time, surprisingly, the test result says that the decomposed data is stationary.

acf(prod3_weekly_dec$random, na.action=na.pass)
pacf(prod3_weekly_dec$random, na.action=na.pass)

At lag 2 and 6, there is relatively high correlations seen by the PACF graph.

Monthly Decomposition

prod3_monthly <- prod3[, month:=yearmonth(event_date)]
prod3_monthly[, .(sold_count=sum(sold_count)), by=month]
prod3_monthly[, trend:=1:.N]

prod3_reg <- lm(sold_count~trend+as.factor(month), data=prod3_monthly)
prod3_monthly[, regOutput:=predict(prod3_reg, prod3_monthly)]

prod3_monthly[, random:=sold_count-regOutput]

ggplot(prod3_monthly, aes(x=event_date, y=random)) + geom_line()
month sold_count
2020 May 266
2020 Jun 494
2020 Jul 24
2020 Aug 0
2020 Sep 0
2020 Oct 0
2020 Nov 1
2020 Dec 1
2021 Jan 0
2021 Feb 279
2021 Mar 917
2021 Apr 796
2021 May 1125

u_test <- ur.kpss(prod3_monthly$random)
summary(u_test)
####################### 
# KPSS Unit Root Test # 
####################### 

Test is of type: mu with 5 lags. 

Value of test-statistic is: 0.0279 

Critical value for a significance level of: 
                10pct  5pct 2.5pct  1pct
critical values 0.347 0.463  0.574 0.739

According to the test result, we cannot reject the null hyphothesis. The decomposed data is stationary.

acf(prod3_monthly$random, na.action=na.pass)
pacf(prod3_monthly$random, na.action=na.pass)

From the graphs above, it can be said that the data is autocorrelated with its first lag.

Building an ARIMA Model

bikini2 <- data[product_content_id=="32737302",][order(event_date)]
bikini2$is_summer <- 0
bikini2[month(bikini2$event_date) %in% c(6,7,8),]$is_summer <- 1
bikini2_train <- bikini2[event_date<test_start,]
bikini2_ts <- ts(bikini2_train$sold_count,freq=9)
ggplot(bikini2_train,aes(x=event_date,y=sold_count)) + geom_line() +
  labs(title= "Graph of Trendyolmilla Bikini 2", x= "Date", y="Quantity")

ggplot(bikini2_train[2:.N,],aes(x=event_date)) + geom_line(aes(y=diff(bikini2_train$sold_count,1))) + 
  labs(title= "Graph of Differenced Series", x= "Date", y="Quantity")

After the differencing, series seem more stationary now. According to the decomposition, there is a seasonal cycle at every 9 data points.

acf(diff(bikini2_train$sold_count,1))
pacf(diff(bikini2_train$sold_count,1))

After the analysis of ACF and PACF graphs, we could say that AR(1), AR(2), AR(3) and seasonal AR(1)-AR(2) could be tried. And also MA(2), MA(3) and seasonal MA(1)-MA(2) are strong candidates for a SARIMA model.

arima(bikini2_ts,order=c(1,1,2),seasonal=c(2,0,2))
Call:
arima(x = bikini2_ts, order = c(1, 1, 2), seasonal = c(2, 0, 2))

Coefficients:
         ar1      ma1      ma2    sar1     sar2     sma1     sma2
      0.4973  -0.7587  -0.0977  0.9484  -0.0892  -0.6536  -0.1968
s.e.  0.1099   0.1141   0.0772  0.3528   0.2148   0.3543   0.2514

sigma^2 estimated as 41.76:  log likelihood = -1206.21,  aic = 2428.42
arima(bikini2_ts,order=c(1,1,3),seasonal=c(2,0,2))
Call:
arima(x = bikini2_ts, order = c(1, 1, 3), seasonal = c(2, 0, 2))

Coefficients:
          ar1     ma1      ma2      ma3    sar1    sar2     sma1     sma2
      -0.2620  0.0335  -0.2677  -0.3652  0.8342  0.0368  -0.5621  -0.3158
s.e.   0.1306  0.1243   0.0531   0.0699  0.2916  0.2454   0.2892   0.2627

sigma^2 estimated as 40.74:  log likelihood = -1201.8,  aic = 2421.6
arima(bikini2_ts,order=c(3,1,3),seasonal=c(2,0,2))
Call:
arima(x = bikini2_ts, order = c(3, 1, 3), seasonal = c(2, 0, 2))

Coefficients:
          ar1      ar2     ar3     ma1      ma2      ma3    sar1     sar2
      -0.6159  -0.2078  0.4195  0.4067  -0.1174  -0.8729  0.9132  -0.0851
s.e.   0.0701   0.0879  0.0694  0.0428   0.0526   0.0432  0.3979   0.2365
         sma1     sma2
      -0.6152  -0.2018
s.e.   0.4014   0.2921

sigma^2 estimated as 38.24:  log likelihood = -1193.01,  aic = 2408.02
arima(bikini2_ts,order=c(3,1,3),seasonal=c(1,0,1))
Call:
arima(x = bikini2_ts, order = c(3, 1, 3), seasonal = c(1, 0, 1))

Coefficients:
          ar1      ar2     ar3     ma1      ma2      ma3     sar1    sma1
      -0.6180  -0.2056  0.4218  0.4062  -0.1219  -0.8754  -0.0187  0.3176
s.e.   0.0705   0.0886  0.0693  0.0441   0.0533   0.0448   0.1941  0.1860

sigma^2 estimated as 38.34:  log likelihood = -1193.43,  aic = 2404.86
auto.arima(bikini2_ts)
Series: bikini2_ts 
ARIMA(2,1,1)(1,0,0)[9] 

Coefficients:
         ar1      ar2      ma1    sar1
      0.6124  -0.0596  -0.8741  0.2709
s.e.  0.0678   0.0595   0.0463  0.0531

sigma^2 estimated as 42.58:  log likelihood=-1207.61
AIC=2425.23   AICc=2425.4   BIC=2444.76

The best model is SARIMA(3,1,3)(1,0,1) with AIC value 2404.86.

model_bikini2_sarima <- arima(bikini2_ts,order=c(3,1,3),seasonal=c(1,0,1))
checkresiduals(model_bikini2_sarima)
bikini2_train$res1 <- model_bikini2_sarima$residuals
corr <- cor(bikini2_train[!is.na(bikini2_train$price),c(-1,-2)])
ggcorrplot(corr,hc.order = TRUE, type = "lower")
    Ljung-Box test

data:  Residuals from ARIMA(3,1,3)(1,0,1)[9]
Q* = 12.761, df = 10, p-value = 0.2373

Model df: 8.   Total lags used: 18

Residuals have constant mean at 0 and constant variance. Distribution seems normal. Due to the correlation with residuals, basket_count could be added as a regressor.

ggplot(bikini2_train,aes(x=res1,y=basket_count)) +  geom_point() + geom_smooth()

ggplot(bikini2_train[2:.N],aes(x=res1)) +  geom_point(aes(y=diff(bikini2_train$basket_count,1))) + 
geom_smooth(aes(y=diff(bikini2_train$basket_count,1)))

cor(bikini2_train[2:.N]$res1,diff(bikini2_train$basket_count,1))
bikini2_train$diff_basket <- c(NA,diff(bikini2_train$basket_count,1))
bikini2$diff_basket <- c(NA,diff(bikini2$basket_count,1))

0.56240554985095

model_bikini2_arimax <- arima(bikini2_ts,order=c(3,1,3),seasonal=c(1,0,1),xreg=bikini2_train$diff_basket)
checkresiduals(model_bikini2_arimax)
bikini2_train$res2 <- model_bikini2_arimax$residuals
corr <- cor(bikini2_train[!is.na(bikini2_train$price) & !is.na(bikini2_train$res2),c(-1,-2)])
ggcorrplot(corr,hc.order = TRUE, type = "lower")
    Ljung-Box test

data:  Residuals from ARIMA(3,1,3)(1,0,1)[9]
Q* = 15.515, df = 9, p-value = 0.07773

Model df: 9.   Total lags used: 18

Linear Models

Firstly, the days of week and trend columns will be added to the data. Next, the first model will be built with trend and weekly seasonality. Then, we’ll look for possible regressors and add them to the model. Finally, we will find the best model among them.

prod3[, `:=`(trend=1:.N, 
            days=wday(event_date, label=T))]

model3_1 <- lm(sold_count~trend+days, data=prod3)
summary(model3_1)
Call:
lm(formula = sold_count ~ trend + days, data = prod3)

Residuals:
    Min      1Q  Median      3Q     Max 
-17.681 -10.241  -4.269   5.879  63.608 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -2.704748   1.555704  -1.739    0.083 .  
trend        0.072121   0.007307   9.870   <2e-16 ***
days.L      -0.781163   2.060793  -0.379    0.705    
days.Q      -1.353806   2.057072  -0.658    0.511    
days.C      -0.021379   2.055235  -0.010    0.992    
days^4       0.527653   2.053988   0.257    0.797    
days^5      -0.465548   2.049616  -0.227    0.820    
days^6       0.362088   2.046213   0.177    0.860    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 14.89 on 360 degrees of freedom
Multiple R-squared:  0.2143,    Adjusted R-squared:  0.199 
F-statistic: 14.03 on 7 and 360 DF,  p-value: 4.214e-16

It seems that days are insignificant. They will be removed from the model. Residual analysis will be made on the final model. Now, we will check for the possible regressors.

temp <- lm(sold_count~trend, data=prod3)
prod3[, residuals1:=temp$residuals]
corr <- round(cor(Filter(is.numeric, prod3[!is.na(prod3$price)])), 1)
ggcorrplot(corr, hc.order = TRUE,type = "lower")

The maximum correlation with residuals1 is obtained by basket_count. Therefore, its lagged version will be added to the model. To find the lag value, we’ll check the CCF function.

ccf(prod3$residuals1, prod3$basket_count)

model3_2 <- lm(sold_count~trend+shift(basket_count,1), data=prod3)
summary(model3_2)
Call:
lm(formula = sold_count ~ trend + shift(basket_count, 1), data = prod3)

Residuals:
     Min       1Q   Median       3Q      Max 
-29.0056  -0.8440  -0.8036  -0.7433  31.6141 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)            0.7406904  0.6825720   1.085    0.279    
trend                  0.0004191  0.0036850   0.114    0.910    
shift(basket_count, 1) 0.1566509  0.0040083  39.081   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6.428 on 364 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.8507,    Adjusted R-squared:  0.8499 
F-statistic:  1037 on 2 and 364 DF,  p-value: < 2.2e-16

Adjusted Rsqr value is 0.8499. From the correlation tableau, let’s find a second regressor.

prod3[, residuals2:=c(NA, model3_2$residuals)]
corr <- round(cor(Filter(is.numeric, prod3[!is.na(prod3$price) & !is.na(prod3$residuals2)])), 1)
ggcorrplot(corr, hc.order = TRUE,type = "lower")

It seems that there is not another good regressor. Next, the lagged sold_count will be examined.

ccf(prod3$residuals2, prod3$sold_count, na.action=na.pass)

model3_3 <- lm(sold_count~trend+shift(basket_count,1)+shift(sold_count,1), data=prod3)
summary(model3_3)
Call:
lm(formula = sold_count ~ trend + shift(basket_count, 1) + shift(sold_count, 
    1), data = prod3)

Residuals:
     Min       1Q   Median       3Q      Max 
-30.8354  -0.8569  -0.6361  -0.4246  26.4860 

Coefficients:
                       Estimate Std. Error t value Pr(>|t|)    
(Intercept)            0.284785   0.665472   0.428    0.669    
trend                  0.002312   0.003580   0.646    0.519    
shift(basket_count, 1) 0.104113   0.010866   9.581  < 2e-16 ***
shift(sold_count, 1)   0.319444   0.061730   5.175 3.78e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6.212 on 363 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.8609,    Adjusted R-squared:  0.8598 
F-statistic: 749.2 on 3 and 363 DF,  p-value: < 2.2e-16
checkresiduals(model3_3)
    Breusch-Godfrey test for serial correlation of order up to 10

data:  Residuals
LM test = 39.465, df = 10, p-value = 2.104e-05

In the last model, adjusted Rsqr has increased to 0.8598, which is a good value. But when it comes to the residual analysis, there are some problems. Because the prod3 data was dirty, the residuals are also dirty. We cannot say that the variance is constant over time. Plus, the residuals are not normally distributed but the mean of the residuals are near zero. Also, there is not a significant autocorrelation. model3_3 is the final model.

prod3[, preds:=c(NA,model3_3$fitted.values)]
ggplot(prod3, aes(x=preds, y=sold_count)) + geom_point() + geom_abline(intercept=0, slope=1, color="red") 

Forecasting

forecast_data_bikini2 <- data.table(event_date=bikini2[event_date>=test_start&event_date<=test_end,]$event_date,
                                 sold_count=bikini2[event_date>=test_start&event_date<=test_end,]$sold_count)
bikini2[,`:=`(trend=1:.N,
           days=wday(event_date, label=T))]
sarima_fc <- numeric(0)
sarimax_fc <- numeric(0)
reg_fc <- numeric(0)
for(i in 1:length(test_dates)){
  
  train_dt <- bikini2[event_date<test_dates[i],]
  model_sarima <- Arima(train_dt$sold_count,order=c(3,1,3),seasonal=c(1,0,1))
  model_sarimax <- Arima(train_dt$sold_count,order=c(3,1,3),seasonal=c(1,0,1),xreg=train_dt$diff_basket)
  model_reg <- lm(sold_count~trend+shift(basket_count,1)+shift(sold_count,1), data=train_dt)
  newreg <- forecast(auto.arima(train_dt$diff_basket),h=1)$mean[1]
  sarima_temp <- forecast(model_sarima)
  sarimax_temp <- forecast(model_sarimax,xreg=newreg)
  reg_temp <- predict(model_reg,newdata=bikini2[event_date %in% c(test_dates[i],test_dates[i+1])])
  sarima_fc <- c(sarima_fc,sarima_temp$mean[1])
  sarimax_fc <- c(sarimax_fc,sarimax_temp$mean[1])
  reg_fc <- c(reg_fc,reg_temp)
  
}
reg_fc[is.na(reg_fc)] <- mean(reg_fc[!is.na(reg_fc)])
forecast_data_bikini2 <- forecast_data_bikini2[,`:=`(sarima_p=sarima_fc,
                                               sarimax_p=sarimax_fc,
                                               reg_p=reg_fc[c(1,seq(2,28,2))])]
accu(forecast_data_bikini2$sold_count,forecast_data_bikini2$sarima_p)
accu(forecast_data_bikini2$sold_count,forecast_data_bikini2$sarimax_p)
accu(forecast_data_bikini2[1:(.N-1)]$sold_count,forecast_data_bikini2[2:.N]$reg_p)
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
15 49.93333 11.71974 0.2347078 0.01706733 0.194244 11.81797 9.25744 0.185396 0.185396
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
15 49.93333 11.71974 0.2347078 0.01097355 0.217304 13.23445 10.65657 0.2134159 0.2134159
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
14 50.85714 11.58153 0.2277267 0.1489639 0.1521629 10.45751 8.379918 0.1647737 0.1647737
result_data_bikini2 <- data.table(event_date=bikini2[event_date>=result_start&event_date<=result_end,]$event_date,
                               sold_count=bikini2[event_date>=result_start&event_date<=result_end,]$sold_count)
final_fc <- numeric(0)
for(i in 1:length(result_dates)){
  
  train_dt <- bikini2[event_date<result_dates[i],]
  model_final <- lm(sold_count~trend+shift(basket_count,1)+shift(sold_count,1), data=train_dt)
  final_temp <- predict(model_final,newdata=bikini2[event_date %in% c(result_dates[i],result_dates[i+1])])
  final_fc <- c(final_fc,final_temp)
  
}
final_fc[is.na(final_fc)] <- mean(final_fc[!is.na(final_fc)])
result_data_bikini2 <- result_data_bikini2[,`:=`(final_p=final_fc[c(1,seq(2,28,2))])]
accu(result_data_bikini2[1:(.N-1)]$sold_count,result_data_bikini2[2:.N]$final_p)
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
14 72.5 21.43595 0.2956683 0.08923879 0.09759226 9.705456 7.737914 0.1067299 0.1067299
data.frame(event_date=result_data_bikini2[1:(.N-1)]$event_date,
           sold_count=result_data_bikini2[1:(.N-1)]$sold_count,
           predictions=result_data_bikini2[2:.N]$final_p)
event_date sold_count predictions
2021-06-12 41 35.11754
2021-06-13 73 63.48856
2021-06-14 88 75.21750
2021-06-15 88 72.36864
2021-06-16 104 81.69486
2021-06-17 102 109.83913
2021-06-18 76 72.34001
2021-06-19 43 44.03759
2021-06-20 76 72.16040
2021-06-21 53 50.71460
2021-06-22 84 74.83149
2021-06-23 87 77.07839
2021-06-24 48 45.93712
2021-06-25 52 49.59681

Trendyolmilla Bikini1

ggplot(prod2, aes(x=event_date, y=sold_count)) + geom_line() + geom_smooth(se=F)

acf(prod2$sold_count) 

There is decrease in ACF graph which may mean that there is a trend on the data.

Weekly Decomposition

Additive decomposition will be used because the variance does not increase.

prod2_weekly_dec <- decompose(ts(prod2$sold_count, freq=7), type="additive")
plot(prod2_weekly_dec)

plot(prod2_weekly_dec$random)

u_test <- ur.kpss(prod2_weekly_dec$random)
summary(u_test)
####################### 
# KPSS Unit Root Test # 
####################### 

Test is of type: mu with 5 lags. 

Value of test-statistic is: 0.0128 

Critical value for a significance level of: 
                10pct  5pct 2.5pct  1pct
critical values 0.347 0.463  0.574 0.739

From the KPSS test above, the data is stationary.

acf(prod2_weekly_dec$random, na.action=na.pass)
pacf(prod2_weekly_dec$random, na.action=na.pass)

There are a little correlation at lag 2 and lag 5. We can see them on the PACF graph.

Monthly Decomposition

prod2_monthly <- prod2[, month:=yearmonth(event_date)]
prod2_monthly[, .(sold_count=sum(sold_count)), by=month]
prod2_monthly[, trend:=1:.N]

prod2_reg <- lm(sold_count~trend+as.factor(month), data=prod2_monthly)
prod2_monthly[, regOutput:=predict(prod2_reg, prod2_monthly)]

prod2_monthly[, random:=sold_count-regOutput]

ggplot(prod2_monthly, aes(x=event_date, y=random)) + geom_line()
month sold_count
2020 May 0
2020 Jun 0
2020 Jul 0
2020 Aug 0
2020 Sep 0
2020 Oct 0
2020 Nov 0
2020 Dec 0
2021 Jan 9
2021 Feb 514
2021 Mar 136
2021 Apr 276
2021 May 4057

u_test <- ur.kpss(prod2_monthly$random)
summary(u_test)
####################### 
# KPSS Unit Root Test # 
####################### 

Test is of type: mu with 5 lags. 

Value of test-statistic is: 0.0318 

Critical value for a significance level of: 
                10pct  5pct 2.5pct  1pct
critical values 0.347 0.463  0.574 0.739

From the unit test, the data is stationary.

acf(prod2_monthly$random, na.action=na.pass)
pacf(prod2_monthly$random, na.action=na.pass)

The first lag has a high correlation value.

Building an ARIMA Model

bikini1 <- data[product_content_id=="73318567",][order(event_date)]
bikini1_train <- bikini1[event_date<test_start,]
bikini1_ts <- ts(bikini1_train$sold_count)
ggplot(bikini1_train,aes(x=event_date,y=sold_count)) + geom_line() +
  labs(title= "Graph of Trendyolmilla Bikini1", x= "Date", y="Quantity")

ggplot(bikini1_train[2:.N,],aes(x=event_date)) + geom_line(aes(y=diff(bikini1_train$sold_count,1))) + 
  labs(title= "Graph of Differenced Series", x= "Date", y="Quantity")

acf(diff(bikini1_train$sold_count,1))
pacf(diff(bikini1_train$sold_count,1))

According to the ACF and PACF graphs, AR(2), AR(3) and MA(2), MA(3) are good candidates for an ARIMA model.

arima(bikini1_ts,order=c(3,1,2))
Call:
arima(x = bikini1_ts, order = c(3, 1, 2))

Coefficients:
         ar1      ar2     ar3      ma1      ma2
      0.3267  -0.0124  0.1759  -0.5086  -0.1640
s.e.  0.3218   0.1367  0.0957   0.3207   0.1387

sigma^2 estimated as 147.1:  log likelihood = -1436.72,  aic = 2885.43
arima(bikini1_ts,order=c(2,1,3))
Call:
arima(x = bikini1_ts, order = c(2, 1, 3))

Coefficients:
          ar1      ar2     ma1     ma2      ma3
      -1.4748  -0.7018  1.4065  0.2774  -0.3920
s.e.   0.0476   0.0475  0.0623  0.1086   0.0586

sigma^2 estimated as 131:  log likelihood = -1417.67,  aic = 2847.33
auto.arima(bikini1_ts)
Series: bikini1_ts 
ARIMA(2,1,0) 

Coefficients:
          ar1      ar2
      -0.1866  -0.2597
s.e.   0.0512   0.0511

sigma^2 estimated as 148.5:  log likelihood=-1437.4
AIC=2880.81   AICc=2880.87   BIC=2892.52

The lowest AIC value is 2847.33 on the ARIMA(2,1,3)

model_bikini1_arima <- arima(bikini1_ts,order=c(2,1,1))
checkresiduals(model_bikini1_arima)
bikini1_train$res1 <- model_bikini1_arima$residuals
corr <- cor(bikini1_train[!is.na(bikini1_train$price),c(-1,-2)])
ggcorrplot(corr,hc.order = TRUE, type = "lower")
    Ljung-Box test

data:  Residuals from ARIMA(2,1,1)
Q* = 36.21, df = 7, p-value = 6.616e-06

Model df: 3.   Total lags used: 10

There is no significant correlation between residuals and other variables because the nature of the data.

Linear Models

Again, the days of week and trend columns will be added to the data firstly. Next, the first model will be built with trend and weekly seasonality. Then, we’ll look for possible regressors and add them to the model. Finally, we will find the best model among them.

prod2[, `:=`(trend=1:.N, 
            days=wday(event_date, label=T))]

model2_1 <- lm(sold_count~trend+days, data=prod2)
summary(model2_1)
Call:
lm(formula = sold_count ~ trend + days, data = prod2)

Residuals:
    Min      1Q  Median      3Q     Max 
-43.973 -21.974  -6.404  10.939 193.487 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -21.92474    3.81514  -5.747 1.94e-08 ***
trend         0.19238    0.01792  10.736  < 2e-16 ***
days.L        1.07026    5.05380   0.212    0.832    
days.Q        1.50219    5.04467   0.298    0.766    
days.C       -0.96008    5.04016  -0.190    0.849    
days^4        3.25069    5.03711   0.645    0.519    
days^5        1.20394    5.02638   0.240    0.811    
days^6       -0.06179    5.01804  -0.012    0.990    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 36.52 on 360 degrees of freedom
Multiple R-squared:  0.2438,    Adjusted R-squared:  0.2291 
F-statistic: 16.58 on 7 and 360 DF,  p-value: < 2.2e-16

It seems that days are insignificant. They will be removed from the model. Residual analysis will be made on the final model. Now, we will check for the possible regressors.

temp <- lm(sold_count~trend, data=prod2)
prod2[, residuals1:=temp$residuals]
corr <- round(cor(Filter(is.numeric, prod2[!is.na(prod2$price)])), 1)
ggcorrplot(corr, hc.order = TRUE,type = "lower")

Again, the maximum correlation with residuals1 is obtained by basket_count. Therefore, its lagged version will be added to the model. To find the lag value, we’ll check the CCF function.

ccf(prod2$residuals1, prod2$basket_count)

model2_2 <- lm(sold_count~trend+shift(basket_count,1), data=prod2)
summary(model2_2)
Call:
lm(formula = sold_count ~ trend + shift(basket_count, 1), data = prod2)

Residuals:
    Min      1Q  Median      3Q     Max 
-82.115  -0.964   0.065   1.154  93.142 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)            -1.963759   1.562898  -1.256    0.210    
trend                   0.012172   0.008103   1.502    0.134    
shift(basket_count, 1)  0.212920   0.004756  44.767   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 14.24 on 364 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.8836,    Adjusted R-squared:  0.883 
F-statistic:  1382 on 2 and 364 DF,  p-value: < 2.2e-16

Adjusted Rsqr value is 0.883, which is a very high value. From the correlation tableau, let’s find a second regressor.

prod2[, residuals2:=c(NA, model2_2$residuals)]
corr <- round(cor(Filter(is.numeric, prod2[!is.na(prod2$price) & !is.na(prod2$residuals2)])), 1)
ggcorrplot(corr, hc.order = TRUE,type = "lower")

It seems that there is not another good regressor. Next, the lagged sold_count will be examined.

ccf(prod2$residuals2, prod2$sold_count, na.action=na.pass)

model2_3 <- lm(sold_count~trend+shift(basket_count,1)+shift(sold_count,3), data=prod2)
summary(model2_3)
Call:
lm(formula = sold_count ~ trend + shift(basket_count, 1) + shift(sold_count, 
    3), data = prod2)

Residuals:
    Min      1Q  Median      3Q     Max 
-83.869  -1.011   0.015   0.743  87.559 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)            -1.348168   1.448190  -0.931    0.353    
trend                   0.008229   0.007490   1.099    0.273    
shift(basket_count, 1)  0.147325   0.008761  16.817   <2e-16 ***
shift(sold_count, 3)    0.348722   0.040443   8.623   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 13.02 on 361 degrees of freedom
  (3 observations deleted due to missingness)
Multiple R-squared:  0.9035,    Adjusted R-squared:  0.9027 
F-statistic:  1126 on 3 and 361 DF,  p-value: < 2.2e-16
checkresiduals(model2_3)
    Breusch-Godfrey test for serial correlation of order up to 10

data:  Residuals
LM test = 95.276, df = 10, p-value = 4.785e-16

Adjusted Rsqr value is 0.9027. Also in the first lags, there is not a significantly high correlation. But as we saw on the prod3, the data is dirty which makes the residuals dirty. The mean is near zero but the variance changes. Also, normality assumption is not satisfied. The last model is model2_3.

prod2[, preds:=c(NA,NA,NA,model2_3$fitted.values)]
ggplot(prod2, aes(x=preds, y=sold_count)) + geom_point() + geom_abline(intercept=0, slope=1, color="red") 

Forecasting

forecast_data_bikini1 <- data.table(event_date=bikini1[event_date>=test_start&event_date<=test_end,]$event_date,
                            sold_count=bikini1[event_date>=test_start&event_date<=test_end,]$sold_count)
bikini1[,`:=`(trend=1:.N,
           days=wday(event_date, label=T))]
arima_fc <- numeric(0)
reg_fc <- numeric(0)
for(i in 1:length(test_dates)){
  
  train_dt <- bikini1[event_date<test_dates[i],]
  model_arima <- Arima(train_dt$sold_count,order=c(3,1,4))
  model_reg <- lm(sold_count~trend+shift(basket_count,1)+shift(sold_count,1), data=train_dt)
  reg_temp <- predict(model_reg,newdata=bikini1[event_date %in% c(test_dates[i],test_dates[i+1])])
  arima_temp <- forecast(model_arima)
  arima_fc <- c(arima_fc,arima_temp$mean[1])
  reg_fc <- c(reg_fc,reg_temp)
}
reg_fc[is.na(reg_fc)] <- mean(reg_fc[!is.na(reg_fc)])
forecast_data_bikini1 <- forecast_data_bikini1[,`:=`(arima_p=arima_fc,
                                               reg_p=reg_fc[c(1,seq(2,28,2))])]
accu(forecast_data_bikini1$sold_count,forecast_data_bikini1$arima_p)
accu(forecast_data_bikini1[1:(.N-1)]$sold_count,forecast_data_bikini1[2:.N]$reg_p)
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
15 114.0667 82.37585 0.7221729 -0.1176104 0.4484135 44.3945 35.1883 0.3084889 0.3084889
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
14 119.0714 83.08519 0.697776 -0.01316139 0.02326047 3.757718 2.152989 0.01808149 0.01808149
result_data_bikini1 <- data.table(event_date=bikini1[event_date>=result_start&event_date<=result_end,]$event_date,
                               sold_count=bikini1[event_date>=result_start&event_date<=result_end,]$sold_count)
final_fc <- numeric(0)
for(i in 1:length(result_dates)){
  
  train_dt <- bikini1[event_date<result_dates[i],]
  model_final <- lm(sold_count~trend+days+shift(basket_count, 1)+shift(sold_count, 1), data=train_dt)
  final_temp <- predict(model_final,newdata=bikini1[event_date %in% c(result_dates[i],result_dates[i+1])])
  final_fc <- c(final_fc,final_temp)
  
}
final_fc[is.na(final_fc)] <- mean(final_fc[!is.na(final_fc)])
result_data_bikini1 <- result_data_bikini1[,`:=`(final_p=final_fc[c(1,seq(2,28,2))])]
accu(result_data_bikini1[1:(.N-1)]$sold_count,result_data_bikini1[2:.N]$final_p)
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
14 41.78571 18.06049 0.4322169 -0.03752398 0.07941104 2.953998 2.405 0.05755555 0.05755555
data.frame(event_date=result_data_bikini1[1:(.N-1)]$event_date,
           sold_count=result_data_bikini1[1:(.N-1)]$sold_count,
           predictions=result_data_bikini1[2:.N]$final_p)
event_date sold_count predictions
2021-06-12 39 40.76201
2021-06-13 44 40.40899
2021-06-14 67 67.80812
2021-06-15 44 46.45402
2021-06-16 72 71.40587
2021-06-17 68 67.41885
2021-06-18 46 51.32375
2021-06-19 26 28.82349
2021-06-20 15 13.90706
2021-06-21 20 23.38950
2021-06-22 47 49.36196
2021-06-23 40 41.67848
2021-06-24 37 37.82823
2021-06-25 20 26.38120

Altinyildiz Classics Mont

ggplot(prod1, aes(x=event_date, y=sold_count)) + geom_line() + geom_smooth(se=F)

acf(prod1$sold_count) 

There is a relatively high autocorrelation with lag 1.

Weekly Decomposition

Additive decomposition will be used because the variance does not increase.

prod1_weekly_dec <- decompose(ts(prod1$sold_count, freq=7), type="additive")
plot(prod1_weekly_dec)

plot(prod1_weekly_dec$random)

Now, let’s check whether the weekly detrended and deseasonalized data is stationary.

u_test <- ur.kpss(prod1_weekly_dec$random)
summary(u_test)
####################### 
# KPSS Unit Root Test # 
####################### 

Test is of type: mu with 5 lags. 

Value of test-statistic is: 0.0093 

Critical value for a significance level of: 
                10pct  5pct 2.5pct  1pct
critical values 0.347 0.463  0.574 0.739

We dont have enough evidence to reject the null hyphothesis. The data is now stationary.

acf(prod1_weekly_dec$random, na.action=na.pass)
pacf(prod1_weekly_dec$random, na.action=na.pass)

From the PACF graph above, the data is correlated with its first lags.

Monthly Decomposition

prod1_monthly <- prod1[, month:=yearmonth(event_date)]
prod1_monthly[, .(sold_count=sum(sold_count)), by=month]
prod1_monthly[, trend:=1:.N]

prod1_reg <- lm(sold_count~trend+as.factor(month), data=prod1_monthly)
prod1_monthly[, regOutput:=predict(prod1_reg, prod1_monthly)]

prod1_monthly[, random:=sold_count-regOutput]

ggplot(prod1_monthly, aes(x=event_date, y=random)) + geom_line()
month sold_count
2020 May 0
2020 Jun 0
2020 Jul 0
2020 Aug 0
2020 Sep 1
2020 Oct 197
2020 Nov 54
2020 Dec 4
2021 Jan 9
2021 Feb 2
2021 Mar 0
2021 Apr 0
2021 May 33

u_test <- ur.kpss(prod1_monthly$random)
summary(u_test)
####################### 
# KPSS Unit Root Test # 
####################### 

Test is of type: mu with 5 lags. 

Value of test-statistic is: 0.0199 

Critical value for a significance level of: 
                10pct  5pct 2.5pct  1pct
critical values 0.347 0.463  0.574 0.739

According to the test above, the data is stationary.

acf(prod1_monthly$random, na.action=na.pass)
pacf(prod1_monthly$random, na.action=na.pass)

There is not a high autocorrelation but we can say that there is a little correlation with lag 1.

Building an ARIMA Model

mont <- data[product_content_id=="48740784",][order(event_date)]
mont$is_winter <- 0
mont[month(mont$event_date) %in% c(9,10,11,12,1),]$is_winter <- 1
mont_train <- mont[event_date<test_start,]
mont_ts <- ts(mont_train$sold_count)
ggplot(mont_train,aes(x=event_date,y=sold_count)) + geom_line() +
  labs(title= "Graph of Altınyıldız Mont", x= "Date", y="Quantity")

ggplot(mont_train[2:.N,],aes(x=event_date)) + geom_line(aes(y=diff(mont_train$sold_count,1))) + 
  labs(title= "Graph of Differenced Series", x= "Date", y="Quantity")

acf(diff(mont_train$sold_count,1))
pacf(diff(mont_train$sold_count,1))

There are spikes at the ACF graph at lags 1 and 4. There are also spikes at the PACF graph at lags 1,2,4. These paramters could be tried on a ARIMA model.

arima(mont_ts,order=c(1,1,1))
Call:
arima(x = mont_ts, order = c(1, 1, 1))

Coefficients:
         ar1      ma1
      0.2693  -0.9204
s.e.  0.0574   0.0229

sigma^2 estimated as 10.08:  log likelihood = -945.38,  aic = 1896.77
arima(mont_ts,order=c(1,1,2))
Call:
arima(x = mont_ts, order = c(1, 1, 2))

Coefficients:
          ar1      ma1      ma2
      -0.5739  -0.0884  -0.7141
s.e.   0.0946   0.0740   0.0595

sigma^2 estimated as 9.949:  log likelihood = -943.09,  aic = 1894.18
arima(mont_ts,order=c(1,1,4))
Call:
arima(x = mont_ts, order = c(1, 1, 4))

Coefficients:
          ar1     ma1      ma2      ma3      ma4
      -0.6211  0.0269  -0.6240  -0.1038  -0.1254
s.e.   0.1219  0.1236   0.0885   0.0690   0.0581

sigma^2 estimated as 9.747:  log likelihood = -939.32,  aic = 1890.65
arima(mont_ts,order=c(4,1,1))
Call:
arima(x = mont_ts, order = c(4, 1, 1))

Coefficients:
         ar1      ar2      ar3      ar4      ma1
      0.2101  -0.0868  -0.0331  -0.1679  -0.8551
s.e.  0.0632   0.0569   0.0560   0.0566   0.0418

sigma^2 estimated as 9.802:  log likelihood = -940.34,  aic = 1892.69
arima(mont_ts,order=c(4,1,2))
Call:
arima(x = mont_ts, order = c(4, 1, 2))

Coefficients:
          ar1     ar2      ar3      ar4      ma1      ma2
      -0.4675  0.0942  -0.0647  -0.1973  -0.1413  -0.6477
s.e.   0.0967  0.0734   0.0654   0.0575   0.0885   0.0838

sigma^2 estimated as 9.53:  log likelihood = -935.27,  aic = 1884.53
arima(mont_ts,order=c(4,1,4))
Call:
arima(x = mont_ts, order = c(4, 1, 4))

Coefficients:
          ar1     ar2     ar3      ar4      ma1      ma2      ma3     ma4
      -0.5644  0.2577  0.0891  -0.2304  -0.0394  -0.8788  -0.0851  0.1928
s.e.   0.3005  0.4435  0.2185   0.0758   0.3065   0.3697   0.2417  0.2992

sigma^2 estimated as 9.514:  log likelihood = -934.96,  aic = 1887.92

The best model is ARIMA(4,1,2) with lowest AIC value of 1884.53.

model_mont_arima <- arima(mont_ts,order=c(4,1,2))
checkresiduals(model_mont_arima)
mont_train$res1 <- model_mont_arima$residuals
corr <- cor(mont_train[!is.na(mont_train$price),c(-1,-2)])
ggcorrplot(corr,hc.order = TRUE, type = "lower")
    Ljung-Box test

data:  Residuals from ARIMA(4,1,2)
Q* = 7.1981, df = 4, p-value = 0.1258

Model df: 6.   Total lags used: 10

Resdiuals looks statioanry with 0 mean and constant variance. Basket_count could be added as a regressor.

ggplot(mont_train,aes(x=res1,y=basket_count)) +  geom_point() + geom_smooth()

ggplot(mont_train[2:.N],aes(x=res1)) +  geom_point(aes(y=diff(mont_train$basket_count,1))) +
  geom_smooth(aes(y=diff(mont_train$basket_count,1)))

cor(mont_train$res1,mont_train$basket_count)

0.646430054049636

arima(mont_ts,order=c(4,1,2),xreg=mont_train$basket_count)
Call:
arima(x = mont_ts, order = c(4, 1, 2), xreg = mont_train$basket_count)

Coefficients:
          ar1     ar2     ar3      ar4      ma1      ma2
      -0.4177  0.4710  0.3093  -0.0730  -0.3040  -0.6960
s.e.   0.1414  0.0639  0.0642   0.0631   0.1345   0.1342
      mont_train$basket_count
                       0.1877
s.e.                   0.0049

sigma^2 estimated as 1.845:  log likelihood = -635.55,  aic = 1287.1

AIC value is decreased when the regressor added.

model_mont_arimax <- arima(mont_ts,order=c(4,1,2),xreg=mont_train$basket_count)
checkresiduals(model_mont_arimax)
mont_train$res2 <- model_mont_arimax$residuals
corr <- cor(mont_train[!is.na(mont_train$price),c(-1,-2)])
ggcorrplot(corr,hc.order = TRUE, type = "lower")
    Ljung-Box test

data:  Residuals from ARIMA(4,1,2)
Q* = 13.039, df = 3, p-value = 0.004552

Model df: 7.   Total lags used: 10

Linear Models

Firstly, the days of week and trend columns will be added to the data. Next, the first model will be built with trend and weekly seasonality. Then, we’ll look for possible regressors and add them to the model. Finally, we will find the best model among them.

prod1[, `:=`(trend=1:.N, 
            days=wday(event_date, label=T))]

model1_1 <- lm(sold_count~trend+days, data=prod1)
summary(model1_1)
Call:
lm(formula = sold_count ~ trend + days, data = prod1)

Residuals:
   Min     1Q Median     3Q    Max 
-1.920 -0.764 -0.651 -0.408 50.140 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)  
(Intercept)  0.8847191  0.3713541   2.382   0.0177 *
trend       -0.0003902  0.0017443  -0.224   0.8231  
days.L      -0.0031543  0.4919214  -0.006   0.9949  
days.Q      -0.8765138  0.4910331  -1.785   0.0751 .
days.C      -0.1295795  0.4905946  -0.264   0.7918  
days^4       0.4566335  0.4902969   0.931   0.3523  
days^5       0.1469379  0.4892534   0.300   0.7641  
days^6      -0.6584222  0.4884411  -1.348   0.1785  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 3.554 on 360 degrees of freedom
Multiple R-squared:  0.01666,   Adjusted R-squared:  -0.002462 
F-statistic: 0.8713 on 7 and 360 DF,  p-value: 0.5293

The model is totally meaningless. We have to find good regressors for the model.

corr <- round(cor(Filter(is.numeric, prod1[!is.na(prod1$price)])), 1)
ggcorrplot(corr, hc.order = TRUE,type = "lower")

Again, the maximum correlation with residuals1 is obtained by basket_count. Therefore, its lagged version will be added to the model. To find the lag value, we’ll check the CCF function.

ccf(prod1$sold_count, prod1$basket_count)

model1_2 <- lm(sold_count~shift(basket_count,1), data=prod1)
summary(model1_2)
Call:
lm(formula = sold_count ~ shift(basket_count, 1), data = prod1)

Residuals:
   Min     1Q Median     3Q    Max 
-9.872 -0.401 -0.401 -0.401 50.732 

Coefficients:
                       Estimate Std. Error t value Pr(>|t|)    
(Intercept)            0.401179   0.175732   2.283    0.023 *  
shift(basket_count, 1) 0.078827   0.009059   8.702   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 3.239 on 365 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.1718,    Adjusted R-squared:  0.1695 
F-statistic: 75.72 on 1 and 365 DF,  p-value: < 2.2e-16

According to the p value, the model is meaningful with an adjusted Rsqr value of 0.1695. Let’s look for the second regressor.

prod1[, residuals2:=c(NA, model1_2$residuals)]
corr <- round(cor(Filter(is.numeric, prod1[!is.na(prod1$price) & !is.na(prod1$residuals2)])), 1)
ggcorrplot(corr, hc.order = TRUE,type = "lower")

It seems that there is not a good second regressor. Now, we can examine the lagged sold_count.

ccf(prod1$residuals2, prod1$sold_count, na.action=na.pass)

model1_3 <- lm(sold_count~shift(basket_count,1)+shift(sold_count,1), data=prod1)
summary(model1_3)
Call:
lm(formula = sold_count ~ shift(basket_count, 1) + shift(sold_count, 
    1), data = prod1)

Residuals:
    Min      1Q  Median      3Q     Max 
-12.095  -0.422  -0.422  -0.422  50.913 

Coefficients:
                       Estimate Std. Error t value Pr(>|t|)  
(Intercept)             0.42174    0.17521   2.407   0.0166 *
shift(basket_count, 1)  0.03979    0.02079   1.913   0.0565 .
shift(sold_count, 1)    0.22780    0.10933   2.084   0.0379 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 3.225 on 364 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.1816,    Adjusted R-squared:  0.1771 
F-statistic: 40.38 on 2 and 364 DF,  p-value: < 2.2e-16
checkresiduals(model1_3)
    Breusch-Godfrey test for serial correlation of order up to 10

data:  Residuals
LM test = 33.707, df = 10, p-value = 0.000207

The adjusted Rsqr value is very low. Linear regression assumptions are not satisfied but there is not a good regressors for this model. Therefore, the final model is model1_3.

prod1[, preds:=c(NA,model1_3$fitted.values)]
ggplot(prod1, aes(x=preds, y=sold_count)) + geom_point() + geom_abline(intercept=0, slope=1, color="red")

Forecasting

forecast_data_mont <- data.table(event_date=mont[event_date>=test_start&event_date<=test_end,]$event_date,
                            sold_count=mont[event_date>=test_start&event_date<=test_end,]$sold_count)
mont[,`:=`(trend=1:.N,
           days=wday(event_date, label=T))]
arima_fc <- numeric(0)
arimax_fc <- numeric(0)
reg_fc <- numeric(0)
for(i in 1:length(test_dates)){
  
  train_dt <- mont[event_date<test_dates[i],]
  model_arima <- Arima(train_dt$sold_count,order=c(4,1,2))
  model_arimax <- Arima(train_dt$sold_count,order=c(4,1,2),xreg=train_dt$basket_count)
  newreg <- forecast(auto.arima(train_dt$basket_count),h=1)$mean[1]
  model_reg <- lm(sold_count~shift(basket_count,1)+shift(sold_count,1), data=train_dt)
  arima_temp <- forecast(model_arima)
  arimax_temp <- forecast(model_arimax,xreg=newreg)
  reg_temp <- predict(model_reg,newdata=mont[event_date %in% c(test_dates[i],test_dates[i+1])])
  arima_fc <- c(arima_fc,arima_temp$mean[1])
  arimax_fc <- c(arimax_fc,arimax_temp$mean[1])
  reg_fc <- c(reg_fc,reg_temp)
  
}
reg_fc[is.na(reg_fc)] <- mean(reg_fc[!is.na(reg_fc)])
forecast_data_mont <- forecast_data_mont[,`:=`(arima_p=arima_fc,
                                               arimax_p=arimax_fc,
                                               reg_p=reg_fc[c(1,seq(2,28,2))])]
accu(forecast_data_mont$sold_count,forecast_data_mont$arima_p)
accu(forecast_data_mont$sold_count,forecast_data_mont$arimax_p)
accu(forecast_data_mont[1:(.N-1)]$sold_count,forecast_data_mont[2:.N]$reg_p)
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
15 2.333333 1.676163 0.7183558 0.08981909 Inf 1.965482 1.658188 0.7106522 0.7106522
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
15 2.333333 1.676163 0.7183558 0.2055338 Inf 1.8071 1.566158 0.6712104 0.6712104
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
14 2.5 1.60528 0.6421119 0.331908 Inf 1.334793 1.104169 0.4416677 0.4416677
result_data_mont <- data.table(event_date=mont[event_date>=result_start&event_date<=result_end,]$event_date,
                               sold_count=mont[event_date>=result_start&event_date<=result_end,]$sold_count)
final_fc <- numeric(0)
for(i in 1:length(result_dates)){
  
  train_dt <- mont[event_date<result_dates[i],]
  model_final <- lm(sold_count~trend+days+shift(basket_count, 1)+shift(sold_count, 1), data=train_dt)
  final_temp <- predict(model_final,newdata=mont[event_date %in% c(result_dates[i],result_dates[i+1])])
  final_fc <- c(final_fc,final_temp)
  
}
final_fc[is.na(final_fc)] <- mean(final_fc[!is.na(final_fc)])
result_data_mont <- result_data_mont[,`:=`(final_p=final_fc[c(1,seq(2,28,2))])]
accu(result_data_mont[1:(.N-1)]$sold_count,result_data_mont[2:.N]$final_p)
n mean sd CV FBias MAPE RMSE MAD MADP WMAPE
14 2.142857 1.657484 0.7734925 0.3304018 Inf 1.331044 1.081682 0.5047848 0.5047848
data.frame(event_date=result_data_mont[1:(.N-1)]$event_date,
           sold_count=result_data_mont[1:(.N-1)]$sold_count,
           predictions=result_data_mont[2:.N]$final_p)
event_date sold_count predictions
2021-06-12 2 1.2841955
2021-06-13 0 0.8663982
2021-06-14 5 2.4952462
2021-06-15 4 3.5127946
2021-06-16 5 2.1635188
2021-06-17 0 0.8157084
2021-06-18 3 1.0377658
2021-06-19 0 0.4540381
2021-06-20 1 1.1227687
2021-06-21 2 1.1356978
2021-06-22 2 2.3568327
2021-06-23 2 0.8385431
2021-06-24 2 1.1292112
2021-06-25 2 0.8752279